aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--AUTHORS1
-rw-r--r--cmd/ztest.c89
-rw-r--r--config/always-arch.m42
-rw-r--r--include/Makefile.am2
-rw-r--r--include/os/freebsd/spl/sys/ccompile.h2
-rw-r--r--include/os/linux/kernel/linux/simd_powerpc.h34
-rw-r--r--include/sys/blake3.h120
-rw-r--r--include/sys/zfs_chksum.h48
-rw-r--r--include/sys/zfs_ioctl.h3
-rw-r--r--include/sys/zio.h1
-rw-r--r--include/sys/zio_checksum.h12
-rw-r--r--include/zfeature_common.h1
-rw-r--r--lib/libicp/Makefile.am25
-rw-r--r--lib/libspl/include/sys/simd.h18
-rw-r--r--lib/libzfs/libzfs.abi9
-rw-r--r--lib/libzpool/Makefile.am2
-rw-r--r--man/man7/zfsprops.77
-rw-r--r--man/man7/zpool-features.78
-rw-r--r--module/Kbuild.in32
-rw-r--r--module/Makefile.bsd34
-rw-r--r--module/icp/algs/blake3/blake3.c732
-rw-r--r--module/icp/algs/blake3/blake3_generic.c202
-rw-r--r--module/icp/algs/blake3/blake3_impl.c256
-rw-r--r--module/icp/algs/blake3/blake3_impl.h213
-rw-r--r--module/icp/algs/blake3/blake3_x86-64.c248
-rw-r--r--module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S2450
-rw-r--r--module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S2463
-rw-r--r--module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S2823
-rw-r--r--module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S3064
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_avx2.S1845
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_avx512.S2618
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_sse2.S2323
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_sse41.S2058
-rw-r--r--module/zcommon/zfeature_common.c31
-rw-r--r--module/zcommon/zfs_prop.c8
-rw-r--r--module/zfs/blake3_zfs.c113
-rw-r--r--module/zfs/spa_misc.c3
-rw-r--r--module/zfs/zfs_chksum.c316
-rw-r--r--module/zfs/zio_checksum.c6
-rw-r--r--tests/runfiles/common.run4
-rw-r--r--tests/zfs-tests/cmd/.gitignore1
-rw-r--r--tests/zfs-tests/cmd/Makefile.am6
-rw-r--r--tests/zfs-tests/cmd/checksum/blake3_test.c575
-rw-r--r--tests/zfs-tests/cmd/checksum/edonr_test.c3
-rw-r--r--tests/zfs-tests/cmd/checksum/sha2_test.c3
-rw-r--r--tests/zfs-tests/cmd/checksum/skein_test.c3
-rw-r--r--tests/zfs-tests/include/commands.cfg1
-rw-r--r--tests/zfs-tests/include/properties.shlib2
-rw-r--r--tests/zfs-tests/tests/Makefile.am1
-rw-r--r--tests/zfs-tests/tests/functional/checksum/default.cfg2
-rwxr-xr-xtests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh30
-rwxr-xr-xtests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh2
-rw-r--r--tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg1
53 files changed, 22804 insertions, 52 deletions
diff --git a/AUTHORS b/AUTHORS
index aab8bf29c..86083ba87 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -285,6 +285,7 @@ CONTRIBUTORS:
Tim Connors <[email protected]>
Tim Crawford <[email protected]>
Tim Haley <[email protected]>
+ Tino Reichardt <[email protected]>
Tobin Harding <[email protected]>
Tom Caputi <[email protected]>
Tom Matthews <[email protected]>
diff --git a/cmd/ztest.c b/cmd/ztest.c
index ca05cf265..95f6107ff 100644
--- a/cmd/ztest.c
+++ b/cmd/ztest.c
@@ -121,6 +121,7 @@
#include <sys/zfeature.h>
#include <sys/dsl_userhold.h>
#include <sys/abd.h>
+#include <sys/blake3.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
@@ -417,6 +418,7 @@ ztest_func_t ztest_device_removal;
ztest_func_t ztest_spa_checkpoint_create_discard;
ztest_func_t ztest_initialize;
ztest_func_t ztest_trim;
+ztest_func_t ztest_blake3;
ztest_func_t ztest_fletcher;
ztest_func_t ztest_fletcher_incr;
ztest_func_t ztest_verify_dnode_bt;
@@ -470,6 +472,7 @@ ztest_info_t ztest_info[] = {
ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely),
ZTI_INIT(ztest_initialize, 1, &zopt_sometimes),
ZTI_INIT(ztest_trim, 1, &zopt_sometimes),
+ ZTI_INIT(ztest_blake3, 1, &zopt_rarely),
ZTI_INIT(ztest_fletcher, 1, &zopt_rarely),
ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely),
ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes),
@@ -6374,6 +6377,92 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id)
}
void
+ztest_blake3(ztest_ds_t *zd, uint64_t id)
+{
+ (void) zd, (void) id;
+ hrtime_t end = gethrtime() + NANOSEC;
+ zio_cksum_salt_t salt;
+ void *salt_ptr = &salt.zcs_bytes;
+ struct abd *abd_data, *abd_meta;
+ void *buf, *templ;
+ int i, *ptr;
+ uint32_t size;
+ BLAKE3_CTX ctx;
+
+ size = ztest_random_blocksize();
+ buf = umem_alloc(size, UMEM_NOFAIL);
+ abd_data = abd_alloc(size, B_FALSE);
+ abd_meta = abd_alloc(size, B_TRUE);
+
+ for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++)
+ *ptr = ztest_random(UINT_MAX);
+ memset(salt_ptr, 'A', 32);
+
+ abd_copy_from_buf_off(abd_data, buf, 0, size);
+ abd_copy_from_buf_off(abd_meta, buf, 0, size);
+
+ while (gethrtime() <= end) {
+ int run_count = 100;
+ zio_cksum_t zc_ref1, zc_ref2;
+ zio_cksum_t zc_res1, zc_res2;
+
+ void *ref1 = &zc_ref1;
+ void *ref2 = &zc_ref2;
+ void *res1 = &zc_res1;
+ void *res2 = &zc_res2;
+
+ /* BLAKE3_KEY_LEN = 32 */
+ VERIFY0(blake3_set_impl_name("generic"));
+ templ = abd_checksum_blake3_tmpl_init(&salt);
+ Blake3_InitKeyed(&ctx, salt_ptr);
+ Blake3_Update(&ctx, buf, size);
+ Blake3_Final(&ctx, ref1);
+ zc_ref2 = zc_ref1;
+ ZIO_CHECKSUM_BSWAP(&zc_ref2);
+ abd_checksum_blake3_tmpl_free(templ);
+
+ VERIFY0(blake3_set_impl_name("cycle"));
+ while (run_count-- > 0) {
+
+ /* Test current implementation */
+ Blake3_InitKeyed(&ctx, salt_ptr);
+ Blake3_Update(&ctx, buf, size);
+ Blake3_Final(&ctx, res1);
+ zc_res2 = zc_res1;
+ ZIO_CHECKSUM_BSWAP(&zc_res2);
+
+ VERIFY0(memcmp(ref1, res1, 32));
+ VERIFY0(memcmp(ref2, res2, 32));
+
+ /* Test ABD - data */
+ templ = abd_checksum_blake3_tmpl_init(&salt);
+ abd_checksum_blake3_native(abd_data, size,
+ templ, &zc_res1);
+ abd_checksum_blake3_byteswap(abd_data, size,
+ templ, &zc_res2);
+
+ VERIFY0(memcmp(ref1, res1, 32));
+ VERIFY0(memcmp(ref2, res2, 32));
+
+ /* Test ABD - metadata */
+ abd_checksum_blake3_native(abd_meta, size,
+ templ, &zc_res1);
+ abd_checksum_blake3_byteswap(abd_meta, size,
+ templ, &zc_res2);
+ abd_checksum_blake3_tmpl_free(templ);
+
+ VERIFY0(memcmp(ref1, res1, 32));
+ VERIFY0(memcmp(ref2, res2, 32));
+
+ }
+ }
+
+ abd_free(abd_data);
+ abd_free(abd_meta);
+ umem_free(buf, size);
+}
+
+void
ztest_fletcher(ztest_ds_t *zd, uint64_t id)
{
(void) zd, (void) id;
diff --git a/config/always-arch.m4 b/config/always-arch.m4
index 02c8e4775..f7090a482 100644
--- a/config/always-arch.m4
+++ b/config/always-arch.m4
@@ -30,6 +30,8 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [
;;
esac
+ AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64)
AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64)
AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc)
+ AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64)
])
diff --git a/include/Makefile.am b/include/Makefile.am
index eee989d4a..1a7f67e9c 100644
--- a/include/Makefile.am
+++ b/include/Makefile.am
@@ -23,6 +23,7 @@ COMMON_H = \
sys/avl.h \
sys/avl_impl.h \
sys/bitops.h \
+ sys/blake3.h \
sys/blkptr.h \
sys/bplist.h \
sys/bpobj.h \
@@ -117,6 +118,7 @@ COMMON_H = \
sys/zfeature.h \
sys/zfs_acl.h \
sys/zfs_bootenv.h \
+ sys/zfs_chksum.h \
sys/zfs_context.h \
sys/zfs_debug.h \
sys/zfs_delay.h \
diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h
index a46a3a18b..90b077a7b 100644
--- a/include/os/freebsd/spl/sys/ccompile.h
+++ b/include/os/freebsd/spl/sys/ccompile.h
@@ -74,10 +74,12 @@ extern "C" {
#ifndef LOCORE
#ifndef HAVE_RPC_TYPES
+#ifndef _KERNEL
typedef int bool_t;
typedef int enum_t;
#endif
#endif
+#endif
#ifndef __cplusplus
#define __init
diff --git a/include/os/linux/kernel/linux/simd_powerpc.h b/include/os/linux/kernel/linux/simd_powerpc.h
index 108cef22f..31e51ea20 100644
--- a/include/os/linux/kernel/linux/simd_powerpc.h
+++ b/include/os/linux/kernel/linux/simd_powerpc.h
@@ -57,25 +57,45 @@
#include <sys/types.h>
#include <linux/version.h>
-#define kfpu_allowed() 1
-#define kfpu_begin() \
- { \
- preempt_disable(); \
- enable_kernel_altivec(); \
- }
+#define kfpu_allowed() 1
+
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
#define kfpu_end() \
{ \
+ disable_kernel_vsx(); \
disable_kernel_altivec(); \
preempt_enable(); \
}
+#define kfpu_begin() \
+ { \
+ preempt_disable(); \
+ enable_kernel_altivec(); \
+ enable_kernel_vsx(); \
+ }
#else
-/* seems that before 4.5 no-one bothered disabling ... */
+/* seems that before 4.5 no-one bothered */
+#define kfpu_begin()
#define kfpu_end() preempt_enable()
#endif
#define kfpu_init() 0
#define kfpu_fini() ((void) 0)
+static inline boolean_t
+zfs_vsx_available(void)
+{
+ boolean_t res;
+#if defined(__powerpc64__)
+ u64 msr;
+#else
+ u32 msr;
+#endif
+ kfpu_begin();
+ __asm volatile("mfmsr %0" : "=r"(msr));
+ res = (msr & 0x800000) != 0;
+ kfpu_end();
+ return (res);
+}
+
/*
* Check if AltiVec instruction set is available
*/
diff --git a/include/sys/blake3.h b/include/sys/blake3.h
new file mode 100644
index 000000000..e6650372c
--- /dev/null
+++ b/include/sys/blake3.h
@@ -0,0 +1,120 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021 Tino Reichardt <[email protected]>
+ */
+
+#ifndef BLAKE3_H
+#define BLAKE3_H
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#else
+#include <stdint.h>
+#include <stdlib.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLAKE3_KEY_LEN 32
+#define BLAKE3_OUT_LEN 32
+#define BLAKE3_MAX_DEPTH 54
+#define BLAKE3_BLOCK_LEN 64
+#define BLAKE3_CHUNK_LEN 1024
+
+/*
+ * This struct is a private implementation detail.
+ * It has to be here because it's part of BLAKE3_CTX below.
+ */
+typedef struct {
+ uint32_t cv[8];
+ uint64_t chunk_counter;
+ uint8_t buf[BLAKE3_BLOCK_LEN];
+ uint8_t buf_len;
+ uint8_t blocks_compressed;
+ uint8_t flags;
+} blake3_chunk_state_t;
+
+typedef struct {
+ uint32_t key[8];
+ blake3_chunk_state_t chunk;
+ uint8_t cv_stack_len;
+
+ /*
+ * The stack size is MAX_DEPTH + 1 because we do lazy merging. For
+ * example, with 7 chunks, we have 3 entries in the stack. Adding an
+ * 8th chunk requires a 4th entry, rather than merging everything down
+ * to 1, because we don't know whether more input is coming. This is
+ * different from how the reference implementation does things.
+ */
+ uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
+
+ /* const blake3_impl_ops_t *ops */
+ const void *ops;
+} BLAKE3_CTX;
+
+/* init the context for hash operation */
+void Blake3_Init(BLAKE3_CTX *ctx);
+
+/* init the context for a MAC and/or tree hash operation */
+void Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]);
+
+/* process the input bytes */
+void Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t input_len);
+
+/* finalize the hash computation and output the result */
+void Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out);
+
+/* finalize the hash computation and output the result */
+void Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out,
+ size_t out_len);
+
+/* return number of supported implementations */
+extern int blake3_get_impl_count(void);
+
+/* return id of selected implementation */
+extern int blake3_get_impl_id(void);
+
+/* return name of selected implementation */
+extern const char *blake3_get_impl_name(void);
+
+/* setup id as fastest implementation */
+extern void blake3_set_impl_fastest(uint32_t id);
+
+/* set implementation by id */
+extern void blake3_set_impl_id(uint32_t id);
+
+/* set implementation by name */
+extern int blake3_set_impl_name(const char *name);
+
+/* set startup implementation */
+extern void blake3_setup_impl(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLAKE3_H */
diff --git a/include/sys/zfs_chksum.h b/include/sys/zfs_chksum.h
new file mode 100644
index 000000000..cfd07bd0f
--- /dev/null
+++ b/include/sys/zfs_chksum.h
@@ -0,0 +1,48 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021 Tino Reichardt <[email protected]>
+ */
+
+#ifndef _ZFS_CHKSUM_H
+#define _ZFS_CHKSUM_H
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#else
+#include <stdint.h>
+#include <stdlib.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Benchmark the chksums of ZFS when the module is loading */
+void chksum_init(void);
+void chksum_fini(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZFS_CHKSUM_H */
diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h
index 4fb15636e..945221796 100644
--- a/include/sys/zfs_ioctl.h
+++ b/include/sys/zfs_ioctl.h
@@ -124,6 +124,7 @@ typedef enum drr_headertype {
* default use of "zfs send" won't encounter the bug mentioned above.
*/
#define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27)
+#define DMU_BACKUP_FEATURE_BLAKE3 (1 << 28)
/*
* Mask of all supported backup features
@@ -134,7 +135,7 @@ typedef enum drr_headertype {
DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \
DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \
DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \
- DMU_BACKUP_FEATURE_ZSTD)
+ DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_BLAKE3)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 7b78f0878..4b624165f 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -89,6 +89,7 @@ enum zio_checksum {
ZIO_CHECKSUM_SHA512,
ZIO_CHECKSUM_SKEIN,
ZIO_CHECKSUM_EDONR,
+ ZIO_CHECKSUM_BLAKE3,
ZIO_CHECKSUM_FUNCTIONS
};
diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h
index 9a73a6262..a2ce50816 100644
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@@ -21,7 +21,8 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2014, 2016 by Delphix. All rights reserved.
- * Copyright Saso Kiselkov 2013, All rights reserved.
+ * Copyright (c) 2013 Saso Kiselkov, All rights reserved.
+ * Copyright (c) 2021 Tino Reichardt <[email protected]>
*/
#ifndef _SYS_ZIO_CHECKSUM_H
@@ -107,6 +108,8 @@ _SYS_ZIO_CHECKSUM_H zio_checksum_info_t
/*
* Checksum routines.
*/
+
+/* SHA2 */
extern zio_checksum_t abd_checksum_SHA256;
extern zio_checksum_t abd_checksum_SHA512_native;
extern zio_checksum_t abd_checksum_SHA512_byteswap;
@@ -123,6 +126,13 @@ extern zio_checksum_t abd_checksum_edonr_byteswap;
extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init;
extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free;
+/* BLAKE3 */
+extern zio_checksum_t abd_checksum_blake3_native;
+extern zio_checksum_t abd_checksum_blake3_byteswap;
+extern zio_checksum_tmpl_init_t abd_checksum_blake3_tmpl_init;
+extern zio_checksum_tmpl_free_t abd_checksum_blake3_tmpl_free;
+
+/* Fletcher 4 */
_SYS_ZIO_CHECKSUM_H zio_abd_checksum_func_t fletcher_4_abd_ops;
extern zio_checksum_t abd_fletcher_4_native;
extern zio_checksum_t abd_fletcher_4_byteswap;
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index d4d636f9c..d98345fe6 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -77,6 +77,7 @@ typedef enum spa_feature {
SPA_FEATURE_DRAID,
SPA_FEATURE_ZILSAXATTR,
SPA_FEATURE_HEAD_ERRLOG,
+ SPA_FEATURE_BLAKE3,
SPA_FEATURES
} spa_feature_t;
diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am
index 304f49e39..b7f1d0e1b 100644
--- a/lib/libicp/Makefile.am
+++ b/lib/libicp/Makefile.am
@@ -13,6 +13,10 @@ nodist_libicp_la_SOURCES = \
module/icp/algs/aes/aes_impl_x86-64.c \
module/icp/algs/aes/aes_impl.c \
module/icp/algs/aes/aes_modes.c \
+ module/icp/algs/blake3/blake3.c \
+ module/icp/algs/blake3/blake3_generic.c \
+ module/icp/algs/blake3/blake3_impl.c \
+ module/icp/algs/blake3/blake3_x86-64.c \
module/icp/algs/edonr/edonr.c \
module/icp/algs/modes/modes.c \
module/icp/algs/modes/cbc.c \
@@ -36,15 +40,30 @@ nodist_libicp_la_SOURCES = \
module/icp/core/kcf_mech_tabs.c \
module/icp/core/kcf_prov_tabs.c
-if TARGET_CPU_X86_64
+if TARGET_CPU_AARCH64
+nodist_libicp_la_SOURCES += \
+ module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \
+ module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+endif
+
+if TARGET_CPU_POWERPC
nodist_libicp_la_SOURCES += \
- module/icp/asm-x86_64/aes/aeskey.c
+ module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \
+ module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
+endif
+
+if TARGET_CPU_X86_64
nodist_libicp_la_SOURCES += \
+ module/icp/asm-x86_64/aes/aeskey.c \
module/icp/asm-x86_64/aes/aes_amd64.S \
module/icp/asm-x86_64/aes/aes_aesni.S \
module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \
module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \
module/icp/asm-x86_64/modes/ghash-x86_64.S \
module/icp/asm-x86_64/sha2/sha256_impl.S \
- module/icp/asm-x86_64/sha2/sha512_impl.S
+ module/icp/asm-x86_64/sha2/sha512_impl.S \
+ module/icp/asm-x86_64/blake3/blake3_avx2.S \
+ module/icp/asm-x86_64/blake3/blake3_avx512.S \
+ module/icp/asm-x86_64/blake3/blake3_sse2.S \
+ module/icp/asm-x86_64/blake3/blake3_sse41.S
endif
diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h
index 6ef836c16..6a6d8b7c6 100644
--- a/lib/libspl/include/sys/simd.h
+++ b/lib/libspl/include/sys/simd.h
@@ -491,6 +491,24 @@ zfs_altivec_available(void)
#endif
return (has_altivec);
}
+static inline boolean_t
+zfs_vsx_available(void)
+{
+ boolean_t has_vsx = B_FALSE;
+#if defined(__ALTIVEC__) && !defined(__FreeBSD__)
+ sighandler_t savesig;
+ savesig = signal(SIGILL, sigillhandler);
+ if (setjmp(env)) {
+ signal(SIGILL, savesig);
+ has_vsx = B_FALSE;
+ } else {
+ __asm__ __volatile__("xssubsp 0,0,0\n");
+ signal(SIGILL, savesig);
+ has_vsx = B_TRUE;
+ }
+#endif
+ return (has_vsx);
+}
#else
#define kfpu_allowed() 0
diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi
index 8a71da951..9f9a2f907 100644
--- a/lib/libzfs/libzfs.abi
+++ b/lib/libzfs/libzfs.abi
@@ -583,7 +583,7 @@
<elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
- <elf-symbol name='spa_feature_table' size='2016' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+ <elf-symbol name='spa_feature_table' size='2072' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
<elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
@@ -4770,8 +4770,8 @@
</function-decl>
</abi-instr>
<abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
- <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='16128' id='9d5e9e2e'>
- <subrange length='36' type-id='7359adad' id='ae666bde'/>
+ <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='16576' id='9d5e9e2e'>
+ <subrange length='37' type-id='7359adad' id='ae666bde'/>
</array-type-def>
<enum-decl name='spa_feature' id='33ecb627'>
<underlying-type type-id='9cac1fee'/>
@@ -4812,7 +4812,8 @@
<enumerator name='SPA_FEATURE_DRAID' value='33'/>
<enumerator name='SPA_FEATURE_ZILSAXATTR' value='34'/>
<enumerator name='SPA_FEATURE_HEAD_ERRLOG' value='35'/>
- <enumerator name='SPA_FEATURES' value='36'/>
+ <enumerator name='SPA_FEATURE_BLAKE3' value='36'/>
+ <enumerator name='SPA_FEATURES' value='37'/>
</enum-decl>
<typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
<enum-decl name='zfeature_flags' id='6db816a4'>
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 60eb30749..eaa920e56 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -67,6 +67,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/abd.c \
module/zfs/aggsum.c \
module/zfs/arc.c \
+ module/zfs/blake3_zfs.c \
module/zfs/blkptr.c \
module/zfs/bplist.c \
module/zfs/bpobj.c \
@@ -171,6 +172,7 @@ nodist_libzpool_la_SOURCES = \
module/zfs/zcp_synctask.c \
module/zfs/zfeature.c \
module/zfs/zfs_byteswap.c \
+ module/zfs/zfs_chksum.c \
module/zfs/zfs_fm.c \
module/zfs/zfs_fuid.c \
module/zfs/zfs_ratelimit.c \
diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7
index 2694938aa..b1e1ce377 100644
--- a/man/man7/zfsprops.7
+++ b/man/man7/zfsprops.7
@@ -743,7 +743,7 @@ This property is not inherited.
.It Xo
.Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns
.Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns
-.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr
+.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr Ns | Ns Sy blake3
.Xc
Controls the checksum used to verify data integrity.
The default value is
@@ -768,8 +768,9 @@ a recommended practice.
The
.Sy sha512 ,
.Sy skein ,
+.Sy edonr ,
and
-.Sy edonr
+.Sy blake3
checksum algorithms require enabling the appropriate features on the pool.
.Pp
Please see
@@ -984,7 +985,7 @@ mount options.
.It Xo
.Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns
.Sy sha256 Ns Oo , Ns Sy verify Oc Ns | Ns Sy sha512 Ns Oo , Ns Sy verify Oc Ns | Ns Sy skein Ns Oo , Ns Sy verify Oc Ns | Ns
-.Sy edonr , Ns Sy verify
+.Sy edonr , Ns Sy verify Ns | Ns Sy blake3 Ns Oo , Ns Sy verify Oc Ns
.Xc
Configures deduplication for a dataset.
The default value is
diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7
index b92109c4a..df9e64701 100644
--- a/man/man7/zpool-features.7
+++ b/man/man7/zpool-features.7
@@ -326,6 +326,12 @@ while
.Sy freeing
is non-zero.
.
+.feature org.openzfs blake3 no extensible_dataset
+This feature enables the use of the BLAKE3 hash algorithm for checksum and dedup.
+BLAKE3 is a secure hash algorithm focused on high performance.
+.Pp
+.checksum-spiel blake3
+.
.feature com.delphix bookmarks yes extensible_dataset
This feature enables use of the
.Nm zfs Cm bookmark
@@ -436,6 +442,8 @@ in ZFS, which means that the checksum is pre-seeded with a secret
to be checksummed.
Thus the produced checksums are unique to a given pool,
preventing hash collision attacks on systems with dedup.
+.Pp
+.checksum-spiel edonr
.
.feature com.delphix embedded_data no
This feature improves the performance and compression ratio of
diff --git a/module/Kbuild.in b/module/Kbuild.in
index 11099999f..ed8dc23a9 100644
--- a/module/Kbuild.in
+++ b/module/Kbuild.in
@@ -75,6 +75,10 @@ ICP_OBJS := \
algs/aes/aes_impl.o \
algs/aes/aes_impl_generic.o \
algs/aes/aes_modes.o \
+ algs/blake3/blake3.o \
+ algs/blake3/blake3_generic.o \
+ algs/blake3/blake3_impl.o \
+ algs/blake3/blake3_x86-64.o \
algs/edonr/edonr.o \
algs/modes/cbc.o \
algs/modes/ccm.o \
@@ -105,23 +109,44 @@ ICP_OBJS_X86_64 := \
asm-x86_64/aes/aes_aesni.o \
asm-x86_64/aes/aes_amd64.o \
asm-x86_64/aes/aeskey.o \
+ asm-x86_64/blake3/blake3_avx2.o \
+ asm-x86_64/blake3/blake3_avx512.o \
+ asm-x86_64/blake3/blake3_sse2.o \
+ asm-x86_64/blake3/blake3_sse41.o \
asm-x86_64/modes/aesni-gcm-x86_64.o \
asm-x86_64/modes/gcm_pclmulqdq.o \
asm-x86_64/modes/ghash-x86_64.o \
asm-x86_64/sha2/sha256_impl.o \
asm-x86_64/sha2/sha512_impl.o
+
ICP_OBJS_X86 := \
algs/aes/aes_impl_aesni.o \
algs/aes/aes_impl_x86-64.o \
algs/modes/gcm_pclmulqdq.o
+
+ICP_OBJS_ARM64 := \
+ asm-aarch64/blake3/b3_aarch64_sse2.o \
+ asm-aarch64/blake3/b3_aarch64_sse41.o
+
+
+ICP_OBJS_PPC_PPC64 := \
+ asm-ppc64/blake3/b3_ppc64le_sse2.o \
+ asm-ppc64/blake3/b3_ppc64le_sse41.o
+
zfs-objs += $(addprefix icp/,$(ICP_OBJS))
zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86))
zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64))
+zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64))
+zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
+zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64))
+
+$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
+ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include)
-$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : asflags-y += -I$(icp_include)
-$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflags-y += -I$(icp_include)
+$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \
+ $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include)
# Suppress objtool "can't find jump dest instruction at" warnings. They
# are caused by the constants which are defined in the text section of the
@@ -129,6 +154,7 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflag
# utility tries to interpret them as opcodes and obviously fails doing so.
OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y
OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y
+
# Suppress objtool "unsupported stack pointer realignment" warnings. We are
# not using a DRAP register while aligning the stack to a 64 byte boundary.
# See #6950 for the reasoning.
@@ -261,6 +287,7 @@ ZFS_OBJS := \
abd.o \
aggsum.o \
arc.o \
+ blake3_zfs.o \
blkptr.o \
bplist.o \
bpobj.o \
@@ -358,6 +385,7 @@ ZFS_OBJS := \
zcp_synctask.o \
zfeature.o \
zfs_byteswap.o \
+ zfs_chksum.o \
zfs_fm.o \
zfs_fuid.o \
zfs_ioctl.o \
diff --git a/module/Makefile.bsd b/module/Makefile.bsd
index 61f02152d..589ca60b2 100644
--- a/module/Makefile.bsd
+++ b/module/Makefile.bsd
@@ -10,6 +10,10 @@ INCDIR=${.CURDIR:H}/include
KMOD= openzfs
.PATH: ${SRCDIR}/avl \
+ ${SRCDIR}/icp/algs/blake3 \
+ ${SRCDIR}/icp/asm-aarch64/blake3 \
+ ${SRCDIR}/icp/asm-ppc64/blake3 \
+ ${SRCDIR}/icp/asm-x86_64/blake3 \
${SRCDIR}/lua \
${SRCDIR}/nvpair \
${SRCDIR}/icp/algs/edonr \
@@ -31,6 +35,7 @@ CFLAGS+= -I${INCDIR}/os/freebsd
CFLAGS+= -I${INCDIR}/os/freebsd/spl
CFLAGS+= -I${INCDIR}/os/freebsd/zfs
CFLAGS+= -I${SRCDIR}/zstd/include
+CFLAGS+= -I${SRCDIR}/icp/include
CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h
CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \
@@ -38,7 +43,8 @@ CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \
-D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DCOMPAT_FREEBSD11
.if ${MACHINE_ARCH} == "amd64"
-CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3
+CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \
+ -DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL
.endif
.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true"
@@ -73,12 +79,32 @@ CFLAGS+= -DBITS_PER_LONG=64
SRCS= vnode_if.h device_if.h bus_if.h
-# avl
+#avl
SRCS+= avl.c
# icp
SRCS+= edonr.c
+#icp/algs/blake3
+SRCS+= blake3.c \
+ blake3_generic.c \
+ blake3_impl.c \
+ blake3_x86-64.c
+
+#icp/asm-aarch64/blake3
+SRCS+= b3_aarch64_sse2.S \
+ b3_aarch64_sse41.S
+
+#icp/asm-ppc64/blake3
+SRCS+= b3_ppc64le_sse2.S \
+ b3_ppc64le_sse41.S
+
+#icp/asm-x86_64/blake3
+SRCS+= blake3_avx2.S \
+ blake3_avx512.S \
+ blake3_sse2.S \
+ blake3_sse41.S
+
#lua
SRCS+= lapi.c \
lauxlib.c \
@@ -189,6 +215,7 @@ SRCS+= zfeature_common.c \
SRCS+= abd.c \
aggsum.c \
arc.c \
+ blake3_zfs.c \
blkptr.c \
bplist.c \
bpobj.c \
@@ -291,6 +318,7 @@ SRCS+= abd.c \
zcp_synctask.c \
zfeature.c \
zfs_byteswap.c \
+ zfs_chksum.c \
zfs_file_os.c \
zfs_fm.c \
zfs_fuid.c \
@@ -337,8 +365,6 @@ SRCS+= zfs_zstd.c \
zstd_decompress.c \
zstd_decompress_block.c
-
-
beforeinstall:
.if ${MK_DEBUG_FILES} != "no"
mtree -eu \
diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c
new file mode 100644
index 000000000..8c9c06eb9
--- /dev/null
+++ b/module/icp/algs/blake3/blake3.c
@@ -0,0 +1,732 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/blake3.h>
+
+#include "blake3_impl.h"
+
+/*
+ * We need 1056 byte stack for blake3_compress_subtree_wide()
+ * - we define this pragma to make gcc happy
+ */
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+/* internal used */
+typedef struct {
+ uint32_t input_cv[8];
+ uint64_t counter;
+ uint8_t block[BLAKE3_BLOCK_LEN];
+ uint8_t block_len;
+ uint8_t flags;
+} output_t;
+
+/* internal flags */
+enum blake3_flags {
+ CHUNK_START = 1 << 0,
+ CHUNK_END = 1 << 1,
+ PARENT = 1 << 2,
+ ROOT = 1 << 3,
+ KEYED_HASH = 1 << 4,
+ DERIVE_KEY_CONTEXT = 1 << 5,
+ DERIVE_KEY_MATERIAL = 1 << 6,
+};
+
+/* internal start */
+static void chunk_state_init(blake3_chunk_state_t *ctx,
+ const uint32_t key[8], uint8_t flags)
+{
+ memcpy(ctx->cv, key, BLAKE3_KEY_LEN);
+ ctx->chunk_counter = 0;
+ memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+ ctx->buf_len = 0;
+ ctx->blocks_compressed = 0;
+ ctx->flags = flags;
+}
+
+static void chunk_state_reset(blake3_chunk_state_t *ctx,
+ const uint32_t key[8], uint64_t chunk_counter)
+{
+ memcpy(ctx->cv, key, BLAKE3_KEY_LEN);
+ ctx->chunk_counter = chunk_counter;
+ ctx->blocks_compressed = 0;
+ memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+ ctx->buf_len = 0;
+}
+
+static size_t chunk_state_len(const blake3_chunk_state_t *ctx)
+{
+ return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) +
+ ((size_t)ctx->buf_len);
+}
+
+static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx,
+ const uint8_t *input, size_t input_len)
+{
+ size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len);
+ if (take > input_len) {
+ take = input_len;
+ }
+ uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len);
+ memcpy(dest, input, take);
+ ctx->buf_len += (uint8_t)take;
+ return (take);
+}
+
+static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx)
+{
+ if (ctx->blocks_compressed == 0) {
+ return (CHUNK_START);
+ } else {
+ return (0);
+ }
+}
+
+static output_t make_output(const uint32_t input_cv[8],
+ const uint8_t *block, uint8_t block_len,
+ uint64_t counter, uint8_t flags)
+{
+ output_t ret;
+ memcpy(ret.input_cv, input_cv, 32);
+ memcpy(ret.block, block, BLAKE3_BLOCK_LEN);
+ ret.block_len = block_len;
+ ret.counter = counter;
+ ret.flags = flags;
+ return (ret);
+}
+
+/*
+ * Chaining values within a given chunk (specifically the compress_in_place
+ * interface) are represented as words. This avoids unnecessary bytes<->words
+ * conversion overhead in the portable implementation. However, the hash_many
+ * interface handles both user input and parent node blocks, so it accepts
+ * bytes. For that reason, chaining values in the CV stack are represented as
+ * bytes.
+ */
+static void output_chaining_value(const blake3_impl_ops_t *ops,
+ const output_t *ctx, uint8_t cv[32])
+{
+ uint32_t cv_words[8];
+ memcpy(cv_words, ctx->input_cv, 32);
+ ops->compress_in_place(cv_words, ctx->block, ctx->block_len,
+ ctx->counter, ctx->flags);
+ store_cv_words(cv, cv_words);
+}
+
+static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx,
+ uint64_t seek, uint8_t *out, size_t out_len)
+{
+ uint64_t output_block_counter = seek / 64;
+ size_t offset_within_block = seek % 64;
+ uint8_t wide_buf[64];
+ while (out_len > 0) {
+ ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len,
+ output_block_counter, ctx->flags | ROOT, wide_buf);
+ size_t available_bytes = 64 - offset_within_block;
+ size_t memcpy_len;
+ if (out_len > available_bytes) {
+ memcpy_len = available_bytes;
+ } else {
+ memcpy_len = out_len;
+ }
+ memcpy(out, wide_buf + offset_within_block, memcpy_len);
+ out += memcpy_len;
+ out_len -= memcpy_len;
+ output_block_counter += 1;
+ offset_within_block = 0;
+ }
+}
+
+static void chunk_state_update(const blake3_impl_ops_t *ops,
+ blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len)
+{
+ if (ctx->buf_len > 0) {
+ size_t take = chunk_state_fill_buf(ctx, input, input_len);
+ input += take;
+ input_len -= take;
+ if (input_len > 0) {
+ ops->compress_in_place(ctx->cv, ctx->buf,
+ BLAKE3_BLOCK_LEN, ctx->chunk_counter,
+ ctx->flags|chunk_state_maybe_start_flag(ctx));
+ ctx->blocks_compressed += 1;
+ ctx->buf_len = 0;
+ memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+ }
+ }
+
+ while (input_len > BLAKE3_BLOCK_LEN) {
+ ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN,
+ ctx->chunk_counter,
+ ctx->flags|chunk_state_maybe_start_flag(ctx));
+ ctx->blocks_compressed += 1;
+ input += BLAKE3_BLOCK_LEN;
+ input_len -= BLAKE3_BLOCK_LEN;
+ }
+
+ size_t take = chunk_state_fill_buf(ctx, input, input_len);
+ input += take;
+ input_len -= take;
+}
+
+static output_t chunk_state_output(const blake3_chunk_state_t *ctx)
+{
+ uint8_t block_flags =
+ ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END;
+ return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter,
+ block_flags));
+}
+
+static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
+ const uint32_t key[8], uint8_t flags)
+{
+ return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT));
+}
+
+/*
+ * Given some input larger than one chunk, return the number of bytes that
+ * should go in the left subtree. This is the largest power-of-2 number of
+ * chunks that leaves at least 1 byte for the right subtree.
+ */
+static size_t left_len(size_t content_len)
+{
+ /*
+ * Subtract 1 to reserve at least one byte for the right side.
+ * content_len
+ * should always be greater than BLAKE3_CHUNK_LEN.
+ */
+ size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+ return (round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN);
+}
+
+/*
+ * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
+ * on a single thread. Write out the chunk chaining values and return the
+ * number of chunks hashed. These chunks are never the root and never empty;
+ * those cases use a different codepath.
+ */
+static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops,
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+ const uint8_t *chunks_array[MAX_SIMD_DEGREE];
+ size_t input_position = 0;
+ size_t chunks_array_len = 0;
+ while (input_len - input_position >= BLAKE3_CHUNK_LEN) {
+ chunks_array[chunks_array_len] = &input[input_position];
+ input_position += BLAKE3_CHUNK_LEN;
+ chunks_array_len += 1;
+ }
+
+ ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN /
+ BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE, flags, CHUNK_START,
+ CHUNK_END, out);
+
+ /*
+ * Hash the remaining partial chunk, if there is one. Note that the
+ * empty chunk (meaning the empty message) is a different codepath.
+ */
+ if (input_len > input_position) {
+ uint64_t counter = chunk_counter + (uint64_t)chunks_array_len;
+ blake3_chunk_state_t chunk_state;
+ chunk_state_init(&chunk_state, key, flags);
+ chunk_state.chunk_counter = counter;
+ chunk_state_update(ops, &chunk_state, &input[input_position],
+ input_len - input_position);
+ output_t output = chunk_state_output(&chunk_state);
+ output_chaining_value(ops, &output, &out[chunks_array_len *
+ BLAKE3_OUT_LEN]);
+ return (chunks_array_len + 1);
+ } else {
+ return (chunks_array_len);
+ }
+}
+
+/*
+ * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
+ * on a single thread. Write out the parent chaining values and return the
+ * number of parents hashed. (If there's an odd input chaining value left over,
+ * return it as an additional output.) These parents are never the root and
+ * never empty; those cases use a different codepath.
+ */
+static size_t compress_parents_parallel(const blake3_impl_ops_t *ops,
+ const uint8_t *child_chaining_values, size_t num_chaining_values,
+ const uint32_t key[8], uint8_t flags, uint8_t *out)
+{
+ const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2];
+ size_t parents_array_len = 0;
+
+ while (num_chaining_values - (2 * parents_array_len) >= 2) {
+ parents_array[parents_array_len] = &child_chaining_values[2 *
+ parents_array_len * BLAKE3_OUT_LEN];
+ parents_array_len += 1;
+ }
+
+ ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE,
+ flags | PARENT, 0, 0, out);
+
+ /* If there's an odd child left over, it becomes an output. */
+ if (num_chaining_values > 2 * parents_array_len) {
+ memcpy(&out[parents_array_len * BLAKE3_OUT_LEN],
+ &child_chaining_values[2 * parents_array_len *
+ BLAKE3_OUT_LEN], BLAKE3_OUT_LEN);
+ return (parents_array_len + 1);
+ } else {
+ return (parents_array_len);
+ }
+}
+
+/*
+ * The wide helper function returns (writes out) an array of chaining values
+ * and returns the length of that array. The number of chaining values returned
+ * is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+ * if the input is shorter than that many chunks. The reason for maintaining a
+ * wide array of chaining values going back up the tree, is to allow the
+ * implementation to hash as many parents in parallel as possible.
+ *
+ * As a special case when the SIMD degree is 1, this function will still return
+ * at least 2 outputs. This guarantees that this function doesn't perform the
+ * root compression. (If it did, it would use the wrong flags, and also we
+ * wouldn't be able to implement extendable output.) Note that this function is
+ * not used when the whole input is only 1 chunk long; that's a different
+ * codepath.
+ *
+ * Why not just have the caller split the input on the first update(), instead
+ * of implementing this special rule? Because we don't want to limit SIMD or
+ * multi-threading parallelism for that update().
+ */
+static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops,
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+ /*
+ * Note that the single chunk case does *not* bump the SIMD degree up
+ * to 2 when it is 1. If this implementation adds multi-threading in
+ * the future, this gives us the option of multi-threading even the
+ * 2-chunk case, which can help performance on smaller platforms.
+ */
+ if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) {
+ return (compress_chunks_parallel(ops, input, input_len, key,
+ chunk_counter, flags, out));
+ }
+
+
+ /*
+ * With more than simd_degree chunks, we need to recurse. Start by
+ * dividing the input into left and right subtrees. (Note that this is
+ * only optimal as long as the SIMD degree is a power of 2. If we ever
+ * get a SIMD degree of 3 or something, we'll need a more complicated
+ * strategy.)
+ */
+ size_t left_input_len = left_len(input_len);
+ size_t right_input_len = input_len - left_input_len;
+ const uint8_t *right_input = &input[left_input_len];
+ uint64_t right_chunk_counter = chunk_counter +
+ (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
+
+ /*
+ * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2
+ * to account for the special case of returning 2 outputs when the
+ * SIMD degree is 1.
+ */
+ uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+ size_t degree = ops->degree;
+ if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
+
+ /*
+ * The special case: We always use a degree of at least two,
+ * to make sure there are two outputs. Except, as noted above,
+ * at the chunk level, where we allow degree=1. (Note that the
+ * 1-chunk-input case is a different codepath.)
+ */
+ degree = 2;
+ }
+ uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
+
+ /*
+ * Recurse! If this implementation adds multi-threading support in the
+ * future, this is where it will go.
+ */
+ size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len,
+ key, chunk_counter, flags, cv_array);
+ size_t right_n = blake3_compress_subtree_wide(ops, right_input,
+ right_input_len, key, right_chunk_counter, flags, right_cvs);
+
+ /*
+ * The special case again. If simd_degree=1, then we'll have left_n=1
+ * and right_n=1. Rather than compressing them into a single output,
+ * return them directly, to make sure we always have at least two
+ * outputs.
+ */
+ if (left_n == 1) {
+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+ return (2);
+ }
+
+ /* Otherwise, do one layer of parent node compression. */
+ size_t num_chaining_values = left_n + right_n;
+ return compress_parents_parallel(ops, cv_array,
+ num_chaining_values, key, flags, out);
+}
+
+/*
+ * Hash a subtree with compress_subtree_wide(), and then condense the resulting
+ * list of chaining values down to a single parent node. Don't compress that
+ * last parent node, however. Instead, return its message bytes (the
+ * concatenated chaining values of its children). This is necessary when the
+ * first call to update() supplies a complete subtree, because the topmost
+ * parent node of that subtree could end up being the root. It's also necessary
+ * for extended output in the general case.
+ *
+ * As with compress_subtree_wide(), this function is not used on inputs of 1
+ * chunk or less. That's a different codepath.
+ */
+static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops,
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
+{
+ uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+ size_t num_cvs = blake3_compress_subtree_wide(ops, input, input_len,
+ key, chunk_counter, flags, cv_array);
+
+ /*
+ * If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
+ * compress_subtree_wide() returns more than 2 chaining values. Condense
+ * them into 2 by forming parent nodes repeatedly.
+ */
+ uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
+ while (num_cvs > 2) {
+ num_cvs = compress_parents_parallel(ops, cv_array, num_cvs, key,
+ flags, out_array);
+ memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
+ }
+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+}
+
+static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8],
+ uint8_t flags)
+{
+ memcpy(ctx->key, key, BLAKE3_KEY_LEN);
+ chunk_state_init(&ctx->chunk, key, flags);
+ ctx->cv_stack_len = 0;
+ ctx->ops = blake3_impl_get_ops();
+}
+
+/*
+ * As described in hasher_push_cv() below, we do "lazy merging", delaying
+ * merges until right before the next CV is about to be added. This is
+ * different from the reference implementation. Another difference is that we
+ * aren't always merging 1 chunk at a time. Instead, each CV might represent
+ * any power-of-two number of chunks, as long as the smaller-above-larger
+ * stack order is maintained. Instead of the "count the trailing 0-bits"
+ * algorithm described in the spec, we use a "count the total number of
+ * 1-bits" variant that doesn't require us to retain the subtree size of the
+ * CV on top of the stack. The principle is the same: each CV that should
+ * remain in the stack is represented by a 1-bit in the total number of chunks
+ * (or bytes) so far.
+ */
+static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len)
+{
+ size_t post_merge_stack_len = (size_t)popcnt(total_len);
+ while (ctx->cv_stack_len > post_merge_stack_len) {
+ uint8_t *parent_node =
+ &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN];
+ output_t output =
+ parent_output(parent_node, ctx->key, ctx->chunk.flags);
+ output_chaining_value(ctx->ops, &output, parent_node);
+ ctx->cv_stack_len -= 1;
+ }
+}
+
+/*
+ * In reference_impl.rs, we merge the new CV with existing CVs from the stack
+ * before pushing it. We can do that because we know more input is coming, so
+ * we know none of the merges are root.
+ *
+ * This setting is different. We want to feed as much input as possible to
+ * compress_subtree_wide(), without setting aside anything for the chunk_state.
+ * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
+ * as a single subtree, if at all possible.
+ *
+ * This leads to two problems:
+ * 1) This 64 KiB input might be the only call that ever gets made to update.
+ * In this case, the root node of the 64 KiB subtree would be the root node
+ * of the whole tree, and it would need to be ROOT finalized. We can't
+ * compress it until we know.
+ * 2) This 64 KiB input might complete a larger tree, whose root node is
+ * similarly going to be the root of the whole tree. For example, maybe
+ * we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
+ * node at the root of the 256 KiB subtree until we know how to finalize it.
+ *
+ * The second problem is solved with "lazy merging". That is, when we're about
+ * to add a CV to the stack, we don't merge it with anything first, as the
+ * reference impl does. Instead we do merges using the *previous* CV that was
+ * added, which is sitting on top of the stack, and we put the new CV
+ * (unmerged) on top of the stack afterwards. This guarantees that we never
+ * merge the root node until finalize().
+ *
+ * Solving the first problem requires an additional tool,
+ * compress_subtree_to_parent_node(). That function always returns the top
+ * *two* chaining values of the subtree it's compressing. We then do lazy
+ * merging with each of them separately, so that the second CV will always
+ * remain unmerged. (That also helps us support extendable output when we're
+ * hashing an input all-at-once.)
+ */
+static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN],
+ uint64_t chunk_counter)
+{
+ hasher_merge_cv_stack(ctx, chunk_counter);
+ memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv,
+ BLAKE3_OUT_LEN);
+ ctx->cv_stack_len += 1;
+}
+
+void
+Blake3_Init(BLAKE3_CTX *ctx)
+{
+ hasher_init_base(ctx, BLAKE3_IV, 0);
+}
+
+void
+Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN])
+{
+ uint32_t key_words[8];
+ load_key_words(key, key_words);
+ hasher_init_base(ctx, key_words, KEYED_HASH);
+}
+
+static void
+Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len)
+{
+ /*
+ * Explicitly checking for zero avoids causing UB by passing a null
+ * pointer to memcpy. This comes up in practice with things like:
+ * std::vector<uint8_t> v;
+ * blake3_hasher_update(&hasher, v.data(), v.size());
+ */
+ if (input_len == 0) {
+ return;
+ }
+
+ const uint8_t *input_bytes = (const uint8_t *)input;
+
+ /*
+ * If we have some partial chunk bytes in the internal chunk_state, we
+ * need to finish that chunk first.
+ */
+ if (chunk_state_len(&ctx->chunk) > 0) {
+ size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk);
+ if (take > input_len) {
+ take = input_len;
+ }
+ chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take);
+ input_bytes += take;
+ input_len -= take;
+ /*
+ * If we've filled the current chunk and there's more coming,
+ * finalize this chunk and proceed. In this case we know it's
+ * not the root.
+ */
+ if (input_len > 0) {
+ output_t output = chunk_state_output(&ctx->chunk);
+ uint8_t chunk_cv[32];
+ output_chaining_value(ctx->ops, &output, chunk_cv);
+ hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter);
+ chunk_state_reset(&ctx->chunk, ctx->key,
+ ctx->chunk.chunk_counter + 1);
+ } else {
+ return;
+ }
+ }
+
+ /*
+ * Now the chunk_state is clear, and we have more input. If there's
+ * more than a single chunk (so, definitely not the root chunk), hash
+ * the largest whole subtree we can, with the full benefits of SIMD
+ * (and maybe in the future, multi-threading) parallelism. Two
+ * restrictions:
+ * - The subtree has to be a power-of-2 number of chunks. Only
+ * subtrees along the right edge can be incomplete, and we don't know
+ * where the right edge is going to be until we get to finalize().
+ * - The subtree must evenly divide the total number of chunks up
+ * until this point (if total is not 0). If the current incomplete
+ * subtree is only waiting for 1 more chunk, we can't hash a subtree
+ * of 4 chunks. We have to complete the current subtree first.
+ * Because we might need to break up the input to form powers of 2, or
+ * to evenly divide what we already have, this part runs in a loop.
+ */
+ while (input_len > BLAKE3_CHUNK_LEN) {
+ size_t subtree_len = round_down_to_power_of_2(input_len);
+ uint64_t count_so_far =
+ ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
+ /*
+ * Shrink the subtree_len until it evenly divides the count so
+ * far. We know that subtree_len itself is a power of 2, so we
+ * can use a bitmasking trick instead of an actual remainder
+ * operation. (Note that if the caller consistently passes
+ * power-of-2 inputs of the same size, as is hopefully
+ * typical, this loop condition will always fail, and
+ * subtree_len will always be the full length of the input.)
+ *
+ * An aside: We don't have to shrink subtree_len quite this
+ * much. For example, if count_so_far is 1, we could pass 2
+ * chunks to compress_subtree_to_parent_node. Since we'll get
+ * 2 CVs back, we'll still get the right answer in the end,
+ * and we might get to use 2-way SIMD parallelism. The problem
+ * with this optimization, is that it gets us stuck always
+ * hashing 2 chunks. The total number of chunks will remain
+ * odd, and we'll never graduate to higher degrees of
+ * parallelism. See
+ * https://github.com/BLAKE3-team/BLAKE3/issues/69.
+ */
+ while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
+ subtree_len /= 2;
+ }
+ /*
+ * The shrunken subtree_len might now be 1 chunk long. If so,
+ * hash that one chunk by itself. Otherwise, compress the
+ * subtree into a pair of CVs.
+ */
+ uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
+ if (subtree_len <= BLAKE3_CHUNK_LEN) {
+ blake3_chunk_state_t chunk_state;
+ chunk_state_init(&chunk_state, ctx->key,
+ ctx->chunk.flags);
+ chunk_state.chunk_counter = ctx->chunk.chunk_counter;
+ chunk_state_update(ctx->ops, &chunk_state, input_bytes,
+ subtree_len);
+ output_t output = chunk_state_output(&chunk_state);
+ uint8_t cv[BLAKE3_OUT_LEN];
+ output_chaining_value(ctx->ops, &output, cv);
+ hasher_push_cv(ctx, cv, chunk_state.chunk_counter);
+ } else {
+ /*
+ * This is the high-performance happy path, though
+ * getting here depends on the caller giving us a long
+ * enough input.
+ */
+ uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
+ compress_subtree_to_parent_node(ctx->ops, input_bytes,
+ subtree_len, ctx->key, ctx-> chunk.chunk_counter,
+ ctx->chunk.flags, cv_pair);
+ hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter);
+ hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN],
+ ctx->chunk.chunk_counter + (subtree_chunks / 2));
+ }
+ ctx->chunk.chunk_counter += subtree_chunks;
+ input_bytes += subtree_len;
+ input_len -= subtree_len;
+ }
+
+ /*
+ * If there's any remaining input less than a full chunk, add it to
+ * the chunk state. In that case, also do a final merge loop to make
+ * sure the subtree stack doesn't contain any unmerged pairs. The
+ * remaining input means we know these merges are non-root. This merge
+ * loop isn't strictly necessary here, because hasher_push_cv
+ * already does its own merge loop, but it simplifies
+ * blake3_hasher_finalize below.
+ */
+ if (input_len > 0) {
+ chunk_state_update(ctx->ops, &ctx->chunk, input_bytes,
+ input_len);
+ hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter);
+ }
+}
+
+void
+Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo)
+{
+ size_t done = 0;
+ const uint8_t *data = input;
+ const size_t block_max = 1024 * 64;
+
+ /* max feed buffer to leave the stack size small */
+ while (todo != 0) {
+ size_t block = (todo >= block_max) ? block_max : todo;
+ Blake3_Update2(ctx, data + done, block);
+ done += block;
+ todo -= block;
+ }
+}
+
+void
+Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out)
+{
+ Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN);
+}
+
+void
+Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out,
+ size_t out_len)
+{
+ /*
+ * Explicitly checking for zero avoids causing UB by passing a null
+ * pointer to memcpy. This comes up in practice with things like:
+ * std::vector<uint8_t> v;
+ * blake3_hasher_finalize(&hasher, v.data(), v.size());
+ */
+ if (out_len == 0) {
+ return;
+ }
+ /* If the subtree stack is empty, then the current chunk is the root. */
+ if (ctx->cv_stack_len == 0) {
+ output_t output = chunk_state_output(&ctx->chunk);
+ output_root_bytes(ctx->ops, &output, seek, out, out_len);
+ return;
+ }
+ /*
+ * If there are any bytes in the chunk state, finalize that chunk and
+ * do a roll-up merge between that chunk hash and every subtree in the
+ * stack. In this case, the extra merge loop at the end of
+ * blake3_hasher_update guarantees that none of the subtrees in the
+ * stack need to be merged with each other first. Otherwise, if there
+ * are no bytes in the chunk state, then the top of the stack is a
+ * chunk hash, and we start the merge from that.
+ */
+ output_t output;
+ size_t cvs_remaining;
+ if (chunk_state_len(&ctx->chunk) > 0) {
+ cvs_remaining = ctx->cv_stack_len;
+ output = chunk_state_output(&ctx->chunk);
+ } else {
+ /* There are always at least 2 CVs in the stack in this case. */
+ cvs_remaining = ctx->cv_stack_len - 2;
+ output = parent_output(&ctx->cv_stack[cvs_remaining * 32],
+ ctx->key, ctx->chunk.flags);
+ }
+ while (cvs_remaining > 0) {
+ cvs_remaining -= 1;
+ uint8_t parent_block[BLAKE3_BLOCK_LEN];
+ memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32);
+ output_chaining_value(ctx->ops, &output, &parent_block[32]);
+ output = parent_output(parent_block, ctx->key,
+ ctx->chunk.flags);
+ }
+ output_root_bytes(ctx->ops, &output, seek, out, out_len);
+}
diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c
new file mode 100644
index 000000000..6ff9a845c
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_generic.c
@@ -0,0 +1,202 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include "blake3_impl.h"
+
+#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+static inline void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
+ uint32_t x, uint32_t y)
+{
+ state[a] = state[a] + state[b] + x;
+ state[d] = rotr32(state[d] ^ state[a], 16);
+ state[c] = state[c] + state[d];
+ state[b] = rotr32(state[b] ^ state[c], 12);
+ state[a] = state[a] + state[b] + y;
+ state[d] = rotr32(state[d] ^ state[a], 8);
+ state[c] = state[c] + state[d];
+ state[b] = rotr32(state[b] ^ state[c], 7);
+}
+
+static inline void round_fn(uint32_t state[16], const uint32_t *msg,
+ size_t round)
+{
+ /* Select the message schedule based on the round. */
+ const uint8_t *schedule = BLAKE3_MSG_SCHEDULE[round];
+
+ /* Mix the columns. */
+ g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
+ g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
+ g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
+ g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
+
+ /* Mix the rows. */
+ g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
+ g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
+ g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
+ g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
+}
+
+static inline void compress_pre(uint32_t state[16], const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags)
+{
+ uint32_t block_words[16];
+ block_words[0] = load32(block + 4 * 0);
+ block_words[1] = load32(block + 4 * 1);
+ block_words[2] = load32(block + 4 * 2);
+ block_words[3] = load32(block + 4 * 3);
+ block_words[4] = load32(block + 4 * 4);
+ block_words[5] = load32(block + 4 * 5);
+ block_words[6] = load32(block + 4 * 6);
+ block_words[7] = load32(block + 4 * 7);
+ block_words[8] = load32(block + 4 * 8);
+ block_words[9] = load32(block + 4 * 9);
+ block_words[10] = load32(block + 4 * 10);
+ block_words[11] = load32(block + 4 * 11);
+ block_words[12] = load32(block + 4 * 12);
+ block_words[13] = load32(block + 4 * 13);
+ block_words[14] = load32(block + 4 * 14);
+ block_words[15] = load32(block + 4 * 15);
+
+ state[0] = cv[0];
+ state[1] = cv[1];
+ state[2] = cv[2];
+ state[3] = cv[3];
+ state[4] = cv[4];
+ state[5] = cv[5];
+ state[6] = cv[6];
+ state[7] = cv[7];
+ state[8] = BLAKE3_IV[0];
+ state[9] = BLAKE3_IV[1];
+ state[10] = BLAKE3_IV[2];
+ state[11] = BLAKE3_IV[3];
+ state[12] = counter_low(counter);
+ state[13] = counter_high(counter);
+ state[14] = (uint32_t)block_len;
+ state[15] = (uint32_t)flags;
+
+ round_fn(state, &block_words[0], 0);
+ round_fn(state, &block_words[0], 1);
+ round_fn(state, &block_words[0], 2);
+ round_fn(state, &block_words[0], 3);
+ round_fn(state, &block_words[0], 4);
+ round_fn(state, &block_words[0], 5);
+ round_fn(state, &block_words[0], 6);
+}
+
+static inline void blake3_compress_in_place_generic(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags)
+{
+ uint32_t state[16];
+ compress_pre(state, cv, block, block_len, counter, flags);
+ cv[0] = state[0] ^ state[8];
+ cv[1] = state[1] ^ state[9];
+ cv[2] = state[2] ^ state[10];
+ cv[3] = state[3] ^ state[11];
+ cv[4] = state[4] ^ state[12];
+ cv[5] = state[5] ^ state[13];
+ cv[6] = state[6] ^ state[14];
+ cv[7] = state[7] ^ state[15];
+}
+
+static inline void hash_one_generic(const uint8_t *input, size_t blocks,
+ const uint32_t key[8], uint64_t counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
+{
+ uint32_t cv[8];
+ memcpy(cv, key, BLAKE3_KEY_LEN);
+ uint8_t block_flags = flags | flags_start;
+ while (blocks > 0) {
+ if (blocks == 1) {
+ block_flags |= flags_end;
+ }
+ blake3_compress_in_place_generic(cv, input, BLAKE3_BLOCK_LEN,
+ counter, block_flags);
+ input = &input[BLAKE3_BLOCK_LEN];
+ blocks -= 1;
+ block_flags = flags;
+ }
+ store_cv_words(out, cv);
+}
+
+static inline void blake3_compress_xof_generic(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64])
+{
+ uint32_t state[16];
+ compress_pre(state, cv, block, block_len, counter, flags);
+
+ store32(&out[0 * 4], state[0] ^ state[8]);
+ store32(&out[1 * 4], state[1] ^ state[9]);
+ store32(&out[2 * 4], state[2] ^ state[10]);
+ store32(&out[3 * 4], state[3] ^ state[11]);
+ store32(&out[4 * 4], state[4] ^ state[12]);
+ store32(&out[5 * 4], state[5] ^ state[13]);
+ store32(&out[6 * 4], state[6] ^ state[14]);
+ store32(&out[7 * 4], state[7] ^ state[15]);
+ store32(&out[8 * 4], state[8] ^ cv[0]);
+ store32(&out[9 * 4], state[9] ^ cv[1]);
+ store32(&out[10 * 4], state[10] ^ cv[2]);
+ store32(&out[11 * 4], state[11] ^ cv[3]);
+ store32(&out[12 * 4], state[12] ^ cv[4]);
+ store32(&out[13 * 4], state[13] ^ cv[5]);
+ store32(&out[14 * 4], state[14] ^ cv[6]);
+ store32(&out[15 * 4], state[15] ^ cv[7]);
+}
+
+static inline void blake3_hash_many_generic(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter,
+ boolean_t increment_counter, uint8_t flags, uint8_t flags_start,
+ uint8_t flags_end, uint8_t *out)
+{
+ while (num_inputs > 0) {
+ hash_one_generic(inputs[0], blocks, key, counter, flags,
+ flags_start, flags_end, out);
+ if (increment_counter) {
+ counter += 1;
+ }
+ inputs += 1;
+ num_inputs -= 1;
+ out = &out[BLAKE3_OUT_LEN];
+ }
+}
+
+static inline boolean_t blake3_is_generic_supported(void)
+{
+ return (B_TRUE);
+}
+
+const blake3_impl_ops_t blake3_generic_impl = {
+ .compress_in_place = blake3_compress_in_place_generic,
+ .compress_xof = blake3_compress_xof_generic,
+ .hash_many = blake3_hash_many_generic,
+ .is_supported = blake3_is_generic_supported,
+ .degree = 4,
+ .name = "generic"
+};
diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c
new file mode 100644
index 000000000..c3268ec13
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_impl.c
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+
+#include "blake3_impl.h"
+
+static const blake3_impl_ops_t *const blake3_impls[] = {
+ &blake3_generic_impl,
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE2)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ &blake3_sse2_impl,
+#endif
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE4_1)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ &blake3_sse41_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+ &blake3_avx2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+ &blake3_avx512_impl,
+#endif
+};
+
+/* this pointer holds current ops for implementation */
+static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl;
+
+/* special implementation selections */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX-1)
+#define IMPL_USER (UINT32_MAX-2)
+#define IMPL_PARAM (UINT32_MAX-3)
+
+#define IMPL_READ(i) (*(volatile uint32_t *) &(i))
+static uint32_t icp_blake3_impl = IMPL_FASTEST;
+
+#define BLAKE3_IMPL_NAME_MAX 16
+
+/* id of fastest implementation */
+static uint32_t blake3_fastest_id = 0;
+
+/* currently used id */
+static uint32_t blake3_current_id = 0;
+
+/* id of module parameter (-1 == unused) */
+static int blake3_param_id = -1;
+
+/* return number of supported implementations */
+int
+blake3_get_impl_count(void)
+{
+ static int impls = 0;
+ int i;
+
+ if (impls)
+ return (impls);
+
+ for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+ if (!blake3_impls[i]->is_supported()) continue;
+ impls++;
+ }
+
+ return (impls);
+}
+
+/* return id of selected implementation */
+int
+blake3_get_impl_id(void)
+{
+ return (blake3_current_id);
+}
+
+/* return name of selected implementation */
+const char *
+blake3_get_impl_name(void)
+{
+ return (blake3_selected_impl->name);
+}
+
+/* setup id as fastest implementation */
+void
+blake3_set_impl_fastest(uint32_t id)
+{
+ blake3_fastest_id = id;
+}
+
+/* set implementation by id */
+void
+blake3_set_impl_id(uint32_t id)
+{
+ int i, cid;
+
+ /* select fastest */
+ if (id == IMPL_FASTEST)
+ id = blake3_fastest_id;
+
+ /* select next or first */
+ if (id == IMPL_CYCLE)
+ id = (++blake3_current_id) % blake3_get_impl_count();
+
+ /* 0..N for the real impl */
+ for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+ if (!blake3_impls[i]->is_supported()) continue;
+ if (cid == id) {
+ blake3_current_id = cid;
+ blake3_selected_impl = blake3_impls[i];
+ return;
+ }
+ cid++;
+ }
+}
+
+/* set implementation by name */
+int
+blake3_set_impl_name(const char *name)
+{
+ int i, cid;
+
+ if (strcmp(name, "fastest") == 0) {
+ atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST);
+ blake3_set_impl_id(IMPL_FASTEST);
+ return (0);
+ } else if (strcmp(name, "cycle") == 0) {
+ atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE);
+ blake3_set_impl_id(IMPL_CYCLE);
+ return (0);
+ }
+
+ for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+ if (!blake3_impls[i]->is_supported()) continue;
+ if (strcmp(name, blake3_impls[i]->name) == 0) {
+ if (icp_blake3_impl == IMPL_PARAM) {
+ blake3_param_id = cid;
+ return (0);
+ }
+ blake3_selected_impl = blake3_impls[i];
+ blake3_current_id = cid;
+ return (0);
+ }
+ cid++;
+ }
+
+ return (-EINVAL);
+}
+
+/* setup implementation */
+void
+blake3_setup_impl(void)
+{
+ switch (IMPL_READ(icp_blake3_impl)) {
+ case IMPL_PARAM:
+ blake3_set_impl_id(blake3_param_id);
+ atomic_swap_32(&icp_blake3_impl, IMPL_USER);
+ break;
+ case IMPL_FASTEST:
+ blake3_set_impl_id(IMPL_FASTEST);
+ break;
+ case IMPL_CYCLE:
+ blake3_set_impl_id(IMPL_CYCLE);
+ break;
+ default:
+ blake3_set_impl_id(blake3_current_id);
+ break;
+ }
+}
+
+/* return selected implementation */
+const blake3_impl_ops_t *
+blake3_impl_get_ops(void)
+{
+	/* in cycle mode, each call to ops selects the next impl */
+	if (IMPL_READ(icp_blake3_impl) == IMPL_CYCLE)
+		blake3_set_impl_id(IMPL_CYCLE);
+
+	return (blake3_selected_impl);
+}
+
+#if defined(_KERNEL) && defined(__linux__)
+static int
+icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp)
+{
+ char req_name[BLAKE3_IMPL_NAME_MAX];
+ size_t i;
+
+ /* sanitize input */
+ i = strnlen(name, BLAKE3_IMPL_NAME_MAX);
+ if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX)
+ return (-EINVAL);
+
+ strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX);
+ while (i > 0 && isspace(req_name[i-1]))
+ i--;
+ req_name[i] = '\0';
+
+ atomic_swap_32(&icp_blake3_impl, IMPL_PARAM);
+ return (blake3_set_impl_name(req_name));
+}
+
+static int
+icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+	int i, cid, cnt = 0;
+	const char *fmt;
+
+	/* "cycle" pseudo-impl; brackets mark the active selection */
+	fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle ";
+	cnt += sprintf(buffer + cnt, "%s", fmt);
+
+	/* "fastest" pseudo-impl */
+	fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest ";
+	cnt += sprintf(buffer + cnt, "%s", fmt);
+
+	/* list each supported real implementation by name */
+	for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+		if (!blake3_impls[i]->is_supported()) continue;
+		fmt = (icp_blake3_impl == IMPL_USER &&
+		    cid == blake3_current_id) ? "[%s] " : "%s ";
+		cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name);
+		cid++;
+	}
+
+	buffer[cnt] = 0;
+
+	return (cnt);
+}
+
+module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get,
+ NULL, 0644);
+MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation.");
+#endif
diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h
new file mode 100644
index 000000000..7b40cc4d3
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_impl.h
@@ -0,0 +1,213 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#ifndef BLAKE3_IMPL_H
+#define BLAKE3_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/blake3.h>
+#include <sys/simd.h>
+
+/*
+ * Methods used to define BLAKE3 assembler implementations
+ */
+typedef void (*blake3_compress_in_place_f)(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags);
+
+typedef void (*blake3_compress_xof_f)(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+typedef boolean_t (*blake3_is_supported_f)(void);
+
+typedef struct blake3_impl_ops {
+ blake3_compress_in_place_f compress_in_place;
+ blake3_compress_xof_f compress_xof;
+ blake3_hash_many_f hash_many;
+ blake3_is_supported_f is_supported;
+ int degree;
+ const char *name;
+} blake3_impl_ops_t;
+
+/* Return selected BLAKE3 implementation ops */
+extern const blake3_impl_ops_t *blake3_impl_get_ops(void);
+
+extern const blake3_impl_ops_t blake3_generic_impl;
+
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE2)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+extern const blake3_impl_ops_t blake3_sse2_impl;
+#endif
+
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE4_1)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+extern const blake3_impl_ops_t blake3_sse41_impl;
+#endif
+
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+extern const blake3_impl_ops_t blake3_avx2_impl;
+#endif
+
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+extern const blake3_impl_ops_t blake3_avx512_impl;
+#endif
+
+#if defined(__x86_64)
+#define MAX_SIMD_DEGREE 16
+#else
+#define MAX_SIMD_DEGREE 4
+#endif
+
+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
+
+static const uint32_t BLAKE3_IV[8] = {
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
+
+static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+ {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
+ {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
+ {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
+ {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
+ {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
+};
+
+/* Find index of the highest set bit */
+static inline unsigned int highest_one(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+ return (63 ^ __builtin_clzll(x));
+#elif defined(_MSC_VER) && defined(IS_X86_64)
+ unsigned long index;
+ _BitScanReverse64(&index, x);
+ return (index);
+#elif defined(_MSC_VER) && defined(IS_X86_32)
+ if (x >> 32) {
+ unsigned long index;
+ _BitScanReverse(&index, x >> 32);
+ return (32 + index);
+ } else {
+ unsigned long index;
+ _BitScanReverse(&index, x);
+ return (index);
+ }
+#else
+ unsigned int c = 0;
+ if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; }
+ if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; }
+ if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; }
+ if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; }
+ if (x & 0x000000000000000cULL) { x >>= 2; c += 2; }
+ if (x & 0x0000000000000002ULL) { c += 1; }
+ return (c);
+#endif
+}
+
+/* Count the number of 1 bits. */
+static inline unsigned int popcnt(uint64_t x) {
+ unsigned int count = 0;
+
+ while (x != 0) {
+ count += 1;
+ x &= x - 1;
+ }
+
+ return (count);
+}
+
+/*
+ * Largest power of two less than or equal to x.
+ * As a special case, returns 1 when x is 0.
+ */
+static inline uint64_t round_down_to_power_of_2(uint64_t x) {
+ return (1ULL << highest_one(x | 1));
+}
+
+static inline uint32_t counter_low(uint64_t counter) {
+ return ((uint32_t)counter);
+}
+
+static inline uint32_t counter_high(uint64_t counter) {
+ return ((uint32_t)(counter >> 32));
+}
+
+static inline uint32_t load32(const void *src) {
+ const uint8_t *p = (const uint8_t *)src;
+ return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
+ ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
+}
+
+static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+ uint32_t key_words[8]) {
+ key_words[0] = load32(&key[0 * 4]);
+ key_words[1] = load32(&key[1 * 4]);
+ key_words[2] = load32(&key[2 * 4]);
+ key_words[3] = load32(&key[3 * 4]);
+ key_words[4] = load32(&key[4 * 4]);
+ key_words[5] = load32(&key[5 * 4]);
+ key_words[6] = load32(&key[6 * 4]);
+ key_words[7] = load32(&key[7 * 4]);
+}
+
+static inline void store32(void *dst, uint32_t w) {
+ uint8_t *p = (uint8_t *)dst;
+ p[0] = (uint8_t)(w >> 0);
+ p[1] = (uint8_t)(w >> 8);
+ p[2] = (uint8_t)(w >> 16);
+ p[3] = (uint8_t)(w >> 24);
+}
+
+static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
+ store32(&bytes_out[0 * 4], cv_words[0]);
+ store32(&bytes_out[1 * 4], cv_words[1]);
+ store32(&bytes_out[2 * 4], cv_words[2]);
+ store32(&bytes_out[3 * 4], cv_words[3]);
+ store32(&bytes_out[4 * 4], cv_words[4]);
+ store32(&bytes_out[5 * 4], cv_words[5]);
+ store32(&bytes_out[6 * 4], cv_words[6]);
+ store32(&bytes_out[7 * 4], cv_words[7]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLAKE3_IMPL_H */
diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c
new file mode 100644
index 000000000..48715e212
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_x86-64.c
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include "blake3_impl.h"
+
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE2)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags);
+
+extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+static void blake3_compress_in_place_sse2(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags) {
+ kfpu_begin();
+ zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter,
+ flags);
+ kfpu_end();
+}
+
+static void blake3_compress_xof_sse2(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]) {
+ kfpu_begin();
+ zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags,
+ out);
+ kfpu_end();
+}
+
+static void blake3_hash_many_sse2(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+ kfpu_begin();
+ zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ kfpu_end();
+}
+
+static boolean_t blake3_is_sse2_supported(void)
+{
+#if defined(__x86_64)
+ return (kfpu_allowed() && zfs_sse2_available());
+#elif defined(__PPC64__)
+ return (kfpu_allowed() && zfs_vsx_available());
+#else
+ return (kfpu_allowed());
+#endif
+}
+
+const blake3_impl_ops_t blake3_sse2_impl = {
+ .compress_in_place = blake3_compress_in_place_sse2,
+ .compress_xof = blake3_compress_xof_sse2,
+ .hash_many = blake3_hash_many_sse2,
+ .is_supported = blake3_is_sse2_supported,
+ .degree = 4,
+ .name = "sse2"
+};
+#endif
+
+#if defined(__aarch64__) || \
+	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
+	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
+	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+	uint64_t counter, uint8_t flags);
+
+extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
+	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+	uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
+	size_t num_inputs, size_t blocks, const uint32_t key[8],
+	uint64_t counter, boolean_t increment_counter, uint8_t flags,
+	uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+static void blake3_compress_in_place_sse41(uint32_t cv[8],
+	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+	uint64_t counter, uint8_t flags) {
+	kfpu_begin();
+	zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter,
+	    flags);
+	kfpu_end();
+}
+
+static void blake3_compress_xof_sse41(const uint32_t cv[8],
+	const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+	uint64_t counter, uint8_t flags, uint8_t out[64]) {
+	kfpu_begin();
+	zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags,
+	    out);
+	kfpu_end();
+}
+
+static void blake3_hash_many_sse41(const uint8_t * const *inputs,
+	size_t num_inputs, size_t blocks, const uint32_t key[8],
+	uint64_t counter, boolean_t increment_counter, uint8_t flags,
+	uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+	kfpu_begin();
+	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+	    increment_counter, flags, flags_start, flags_end, out);
+	kfpu_end();
+}
+
+static boolean_t blake3_is_sse41_supported(void)
+{
+#if defined(__x86_64)
+	return (kfpu_allowed() && zfs_sse4_1_available());
+#elif defined(__PPC64__)
+	return (kfpu_allowed() && zfs_vsx_available());
+#else
+	return (kfpu_allowed());
+#endif
+}
+
+const blake3_impl_ops_t blake3_sse41_impl = {
+	.compress_in_place = blake3_compress_in_place_sse41,
+	.compress_xof = blake3_compress_xof_sse41,
+	.hash_many = blake3_hash_many_sse41,
+	.is_supported = blake3_is_sse41_supported,
+	.degree = 4,
+	.name = "sse41"
+};
+#endif
+
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+static void blake3_hash_many_avx2(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+ kfpu_begin();
+ zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ kfpu_end();
+}
+
+static boolean_t blake3_is_avx2_supported(void)
+{
+ return (kfpu_allowed() && zfs_sse4_1_available() &&
+ zfs_avx2_available());
+}
+
+const blake3_impl_ops_t blake3_avx2_impl = {
+ .compress_in_place = blake3_compress_in_place_sse41,
+ .compress_xof = blake3_compress_xof_sse41,
+ .hash_many = blake3_hash_many_avx2,
+ .is_supported = blake3_is_avx2_supported,
+ .degree = 8,
+ .name = "avx2"
+};
+#endif
+
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags);
+
+extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+static void blake3_compress_in_place_avx512(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags) {
+ kfpu_begin();
+ zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter,
+ flags);
+ kfpu_end();
+}
+
+static void blake3_compress_xof_avx512(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]) {
+ kfpu_begin();
+ zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags,
+ out);
+ kfpu_end();
+}
+
+static void blake3_hash_many_avx512(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+ kfpu_begin();
+ zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+ increment_counter, flags, flags_start, flags_end, out);
+ kfpu_end();
+}
+
+static boolean_t blake3_is_avx512_supported(void)
+{
+ return (kfpu_allowed() && zfs_avx512f_available() &&
+ zfs_avx512vl_available());
+}
+
+const blake3_impl_ops_t blake3_avx512_impl = {
+ .compress_in_place = blake3_compress_in_place_avx512,
+ .compress_xof = blake3_compress_xof_avx512,
+ .hash_many = blake3_hash_many_avx512,
+ .is_supported = blake3_is_avx512_supported,
+ .degree = 16,
+ .name = "avx512"
+};
+#endif
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
new file mode 100644
index 000000000..59a4d9afd
--- /dev/null
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -0,0 +1,2450 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE2 -> ARMv8-A
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if defined(__aarch64__)
+ .text
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI0_1:
+ .xword 0
+ .xword -4294967296
+.LCPI0_2:
+ .xword -1
+ .xword 4294967295
+ .text
+ .globl zfs_blake3_compress_in_place_sse2
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse2,@function
+zfs_blake3_compress_in_place_sse2:
+ .cfi_startproc
+ ldp q3, q2, [x0]
+ ldp q5, q6, [x1]
+ add x10, x1, #32
+ lsr x11, x3, #32
+ fmov s4, w3
+ ld2 { v17.4s, v18.4s }, [x10]
+ adrp x10, .LCPI0_2
+ and w8, w2, #0xff
+ mov v4.s[1], w11
+ ldr q1, [x10, :lo12:.LCPI0_2]
+ and w9, w4, #0xff
+ adrp x12, .LCPI0_0
+ mov v4.s[2], w8
+ uzp1 v19.4s, v5.4s, v6.4s
+ add v3.4s, v2.4s, v3.4s
+ ldr q7, [x12, :lo12:.LCPI0_0]
+ mov v4.s[3], w9
+ add v3.4s, v3.4s, v19.4s
+ uzp2 v5.4s, v5.4s, v6.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ uzp1 v6.4s, v19.4s, v19.4s
+ ext v22.16b, v19.16b, v19.16b, #12
+ eor v4.16b, v3.16b, v4.16b
+ ext v20.16b, v17.16b, v17.16b, #12
+ ext v6.16b, v6.16b, v19.16b, #8
+ ext v19.16b, v19.16b, v22.16b, #12
+ zip1 v22.2d, v21.2d, v5.2d
+ rev32 v24.8h, v4.8h
+ mov v4.16b, v1.16b
+ zip2 v23.4s, v5.4s, v21.4s
+ uzp2 v6.4s, v6.4s, v5.4s
+ bsl v4.16b, v22.16b, v20.16b
+ add v3.4s, v3.4s, v5.4s
+ zip1 v5.4s, v23.4s, v20.4s
+ zip1 v22.4s, v20.4s, v23.4s
+ add v23.4s, v24.4s, v7.4s
+ ext v7.16b, v6.16b, v6.16b, #4
+ ext v25.16b, v4.16b, v4.16b, #12
+ ext v5.16b, v22.16b, v5.16b, #8
+ eor v2.16b, v23.16b, v2.16b
+ uzp1 v4.4s, v4.4s, v25.4s
+ uzp1 v22.4s, v7.4s, v7.4s
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v22.16b, v22.16b, v7.16b, #8
+ ext v7.16b, v7.16b, v25.16b, #12
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ add v3.4s, v3.4s, v2.4s
+ eor v24.16b, v3.16b, v24.16b
+ add v3.4s, v3.4s, v17.4s
+ ushr v17.4s, v24.4s, #8
+ shl v18.4s, v24.4s, #24
+ orr v17.16b, v18.16b, v17.16b
+ add v18.4s, v17.4s, v23.4s
+ eor v2.16b, v18.16b, v2.16b
+ ushr v23.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v3.16b, v3.16b, v3.16b, #12
+ orr v2.16b, v2.16b, v23.16b
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v3.4s, v2.4s, v3.4s
+ adrp x11, .LCPI0_1
+ eor v17.16b, v3.16b, v17.16b
+ ldr q16, [x11, :lo12:.LCPI0_1]
+ ext v18.16b, v18.16b, v18.16b, #4
+ rev32 v24.8h, v17.8h
+ movi v0.2d, #0xffffffff00000000
+ add v23.4s, v3.4s, v21.4s
+ mov v21.s[1], v20.s[2]
+ add v20.4s, v18.4s, v24.4s
+ bit v19.16b, v21.16b, v0.16b
+ eor v3.16b, v20.16b, v2.16b
+ uzp2 v2.4s, v22.4s, v19.4s
+ zip1 v17.2d, v5.2d, v19.2d
+ zip2 v18.4s, v19.4s, v5.4s
+ ushr v21.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ ext v22.16b, v2.16b, v2.16b, #4
+ bsl v16.16b, v4.16b, v17.16b
+ zip1 v17.4s, v18.4s, v4.4s
+ zip1 v18.4s, v4.4s, v18.4s
+ orr v21.16b, v3.16b, v21.16b
+ ext v25.16b, v16.16b, v16.16b, #12
+ ext v3.16b, v18.16b, v17.16b, #8
+ uzp1 v18.4s, v22.4s, v22.4s
+ ext v26.16b, v22.16b, v22.16b, #12
+ add v23.4s, v23.4s, v21.4s
+ uzp1 v17.4s, v16.4s, v25.4s
+ ext v16.16b, v18.16b, v22.16b, #8
+ ext v18.16b, v22.16b, v26.16b, #12
+ eor v22.16b, v23.16b, v24.16b
+ add v6.4s, v23.4s, v6.4s
+ ushr v23.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v23.16b
+ add v20.4s, v22.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v23.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v21.16b, v21.16b, v23.16b
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v21.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v20.16b, v20.16b, v20.16b, #12
+ add v6.4s, v6.4s, v19.4s
+ rev32 v19.8h, v22.8h
+ add v20.4s, v20.4s, v19.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v22.4s, v21.4s, #12
+ shl v21.4s, v21.4s, #20
+ orr v21.16b, v21.16b, v22.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ushr v22.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v22.16b
+ add v20.4s, v19.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v22.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ add v6.4s, v6.4s, v4.4s
+ orr v21.16b, v21.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ext v20.16b, v20.16b, v20.16b, #4
+ rev32 v19.8h, v19.8h
+ add v20.4s, v20.4s, v19.4s
+ add v6.4s, v6.4s, v5.4s
+ mov v5.s[1], v4.s[2]
+ eor v4.16b, v20.16b, v21.16b
+ ushr v21.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v21.16b, v4.16b, v21.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ add v2.4s, v6.4s, v2.4s
+ ushr v6.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v6.16b, v19.16b, v6.16b
+ add v19.4s, v6.4s, v20.4s
+ eor v20.16b, v19.16b, v21.16b
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v20.4s, v2.4s
+ eor v6.16b, v2.16b, v6.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v6.4s
+ mov v22.16b, v0.16b
+ eor v20.16b, v19.16b, v20.16b
+ bsl v22.16b, v5.16b, v7.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ add v2.4s, v2.4s, v22.4s
+ orr v20.16b, v20.16b, v21.16b
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ ushr v21.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v6.16b, v6.16b, v21.16b
+ add v19.4s, v6.4s, v19.4s
+ eor v20.16b, v19.16b, v20.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v2.4s, v2.4s, v17.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ uzp2 v5.4s, v16.4s, v22.4s
+ zip1 v7.2d, v3.2d, v22.2d
+ zip2 v16.4s, v22.4s, v3.4s
+ ext v19.16b, v19.16b, v19.16b, #4
+ rev32 v22.8h, v6.8h
+ ext v23.16b, v5.16b, v5.16b, #4
+ bif v7.16b, v17.16b, v1.16b
+ zip1 v24.4s, v16.4s, v17.4s
+ zip1 v16.4s, v17.4s, v16.4s
+ add v21.4s, v2.4s, v3.4s
+ mov v3.s[1], v17.s[2]
+ add v17.4s, v19.4s, v22.4s
+ mov v19.16b, v0.16b
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v4.16b, v16.16b, v24.16b, #8
+ uzp1 v16.4s, v23.4s, v23.4s
+ bsl v19.16b, v3.16b, v18.16b
+ eor v2.16b, v17.16b, v20.16b
+ uzp1 v7.4s, v7.4s, v25.4s
+ ext v25.16b, v16.16b, v23.16b, #8
+ zip1 v3.2d, v4.2d, v19.2d
+ ushr v20.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp2 v6.4s, v25.4s, v19.4s
+ zip2 v18.4s, v19.4s, v4.4s
+ bif v3.16b, v7.16b, v1.16b
+ orr v20.16b, v2.16b, v20.16b
+ ext v16.16b, v23.16b, v24.16b, #12
+ ext v23.16b, v6.16b, v6.16b, #4
+ zip1 v24.4s, v18.4s, v7.4s
+ zip1 v18.4s, v7.4s, v18.4s
+ ext v25.16b, v3.16b, v3.16b, #12
+ add v21.4s, v21.4s, v20.4s
+ ext v2.16b, v18.16b, v24.16b, #8
+ uzp1 v18.4s, v23.4s, v23.4s
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp1 v3.4s, v3.4s, v25.4s
+ eor v22.16b, v21.16b, v22.16b
+ ext v25.16b, v18.16b, v23.16b, #8
+ dup v18.4s, v2.s[3]
+ ext v23.16b, v23.16b, v24.16b, #12
+ add v5.4s, v21.4s, v5.4s
+ trn1 v21.4s, v3.4s, v3.4s
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ ext v18.16b, v21.16b, v18.16b, #8
+ orr v21.16b, v22.16b, v24.16b
+ add v17.4s, v21.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v22.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v20.16b, v20.16b, v22.16b
+ ext v21.16b, v21.16b, v21.16b, #8
+ add v5.4s, v20.4s, v5.4s
+ eor v21.16b, v5.16b, v21.16b
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v5.4s, v5.4s, v19.4s
+ rev32 v19.8h, v21.8h
+ add v17.4s, v17.4s, v19.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v21.16b
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ushr v21.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v21.16b
+ add v17.4s, v19.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v5.4s, v5.4s, v7.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ rev32 v22.8h, v19.8h
+ add v21.4s, v5.4s, v4.4s
+ mov v4.s[1], v7.s[2]
+ add v19.4s, v17.4s, v22.4s
+ bit v16.16b, v4.16b, v0.16b
+ eor v5.16b, v19.16b, v20.16b
+ uzp2 v4.4s, v25.4s, v16.4s
+ zip1 v7.2d, v2.2d, v16.2d
+ zip2 v17.4s, v16.4s, v2.4s
+ ushr v20.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ext v24.16b, v4.16b, v4.16b, #4
+ bif v7.16b, v3.16b, v1.16b
+ zip1 v25.4s, v17.4s, v3.4s
+ zip1 v17.4s, v3.4s, v17.4s
+ orr v20.16b, v5.16b, v20.16b
+ ext v26.16b, v7.16b, v7.16b, #12
+ ext v5.16b, v17.16b, v25.16b, #8
+ uzp1 v17.4s, v24.4s, v24.4s
+ ext v25.16b, v24.16b, v24.16b, #12
+ bit v23.16b, v18.16b, v0.16b
+ add v21.4s, v21.4s, v20.4s
+ uzp1 v7.4s, v7.4s, v26.4s
+ ext v26.16b, v17.16b, v24.16b, #8
+ ext v17.16b, v24.16b, v25.16b, #12
+ eor v22.16b, v21.16b, v22.16b
+ add v6.4s, v21.4s, v6.4s
+ zip1 v21.2d, v5.2d, v23.2d
+ zip2 v24.4s, v23.4s, v5.4s
+ bif v21.16b, v7.16b, v1.16b
+ zip1 v1.4s, v24.4s, v7.4s
+ zip1 v24.4s, v7.4s, v24.4s
+ ext v1.16b, v24.16b, v1.16b, #8
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v24.16b
+ add v19.4s, v22.4s, v19.4s
+ ext v24.16b, v21.16b, v21.16b, #12
+ eor v20.16b, v19.16b, v20.16b
+ uzp1 v21.4s, v21.4s, v24.4s
+ ushr v24.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ orr v20.16b, v20.16b, v24.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v20.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v6.4s, v6.4s, v16.4s
+ rev32 v16.8h, v22.8h
+ add v19.4s, v19.4s, v16.4s
+ eor v20.16b, v19.16b, v20.16b
+ ushr v22.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v22.16b
+ add v6.4s, v6.4s, v20.4s
+ eor v16.16b, v6.16b, v16.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v6.4s, v3.4s
+ ushr v6.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v6.16b, v16.16b, v6.16b
+ add v16.4s, v6.4s, v19.4s
+ eor v19.16b, v16.16b, v20.16b
+ ushr v20.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v20.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v3.4s, v3.4s, v19.4s
+ eor v6.16b, v3.16b, v6.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ add v2.4s, v3.4s, v2.4s
+ rev32 v3.8h, v6.8h
+ add v6.4s, v16.4s, v3.4s
+ eor v16.16b, v6.16b, v19.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ orr v16.16b, v16.16b, v19.16b
+ add v2.4s, v2.4s, v16.4s
+ eor v3.16b, v2.16b, v3.16b
+ add v2.4s, v2.4s, v4.4s
+ ushr v4.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v4.16b
+ add v4.4s, v3.4s, v6.4s
+ eor v6.16b, v4.16b, v16.16b
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v6.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v6.16b, v4.16b, v6.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ add v2.4s, v2.4s, v23.4s
+ orr v6.16b, v6.16b, v16.16b
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v16.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v16.16b
+ add v4.4s, v3.4s, v4.4s
+ eor v6.16b, v4.16b, v6.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ add v2.4s, v2.4s, v7.4s
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ rev32 v3.8h, v3.8h
+ add v2.4s, v2.4s, v5.4s
+ mov v5.s[1], v7.s[2]
+ add v4.4s, v4.4s, v3.4s
+ bsl v0.16b, v5.16b, v17.16b
+ eor v5.16b, v4.16b, v6.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v5.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v6.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v6.16b
+ add v4.4s, v3.4s, v4.4s
+ uzp2 v18.4s, v26.4s, v18.4s
+ eor v5.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v18.4s
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v5.16b, v5.16b, v6.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v5.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v0.4s, v2.4s, v0.4s
+ rev32 v2.8h, v3.8h
+ add v3.4s, v4.4s, v2.4s
+ eor v4.16b, v3.16b, v5.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v4.16b, v4.16b, v5.16b
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ushr v5.4s, v2.4s, #8
+ shl v2.4s, v2.4s, #24
+ orr v2.16b, v2.16b, v5.16b
+ add v3.4s, v2.4s, v3.4s
+ eor v4.16b, v3.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #12
+ ushr v5.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v0.4s, v0.4s, v21.4s
+ orr v4.16b, v4.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ add v0.4s, v0.4s, v1.4s
+ rev32 v1.8h, v2.8h
+ add v2.4s, v3.4s, v1.4s
+ eor v3.16b, v2.16b, v4.16b
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v4.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v1.16b, v0.16b, v1.16b
+ ushr v4.4s, v1.4s, #8
+ shl v1.4s, v1.4s, #24
+ orr v1.16b, v1.16b, v4.16b
+ add v2.4s, v1.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v0.16b, v2.16b, v0.16b
+ orr v2.16b, v3.16b, v4.16b
+ eor v1.16b, v2.16b, v1.16b
+ stp q0, q1, [x0]
+ ret
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2
+ .cfi_endproc
+
+	// Constant pool for zfs_blake3_compress_xof_sse2 (16-byte entries,
+	// mergeable read-only data).
+	.section .rodata.cst16,"aM",@progbits,16
+	.p2align 4
+.LCPI1_0:
+	// First four BLAKE3 IV words: 0x6A09E667, 0xBB67AE85, 0x3C6EF372,
+	// 0xA54FF53A (decimal below).  Loaded via ldr q7 and added into the
+	// third state row before the rounds.
+	.word 1779033703
+	.word 3144134277
+	.word 1013904242
+	.word 2773480762
+.LCPI1_1:
+	// 0x00000000FFFFFFFF / 0xFFFFFFFF00000000-style 64-bit lane masks,
+	// used as bsl/bit/bif select masks in the message-schedule
+	// permutation (loaded as q16/q1 inside the function).
+	.xword 0
+	.xword -4294967296
+.LCPI1_2:
+	.xword -1
+	.xword 4294967295
+	.text
+	// BLAKE3 single-block compression with extended output (XOF variant),
+	// AArch64 NEON port of the upstream "sse2" kernel; compiler-generated
+	// and machine-scheduled — do not hand-edit instruction order.
+	//
+	// Register arguments (inferred from the loads/stores below — matches
+	// the upstream blake3_compress_xof signature; confirm against
+	// blake3_impl.h):
+	//   x0 = cv        32-byte input chaining value (ldp q3, q2, [x0])
+	//   x1 = block     64-byte message block (ldp q5, q6 / ld2 at x1+32)
+	//   w2 = block_len low 8 bits used (and w8, w2, #0xff)
+	//   x3 = counter   64-bit; split into lo/hi words (fmov/lsr)
+	//   w4 = flags     low 8 bits used (and w9, w4, #0xff)
+	//   x5 = out       64-byte extended output (stp/str at x5..x5+48)
+	.globl zfs_blake3_compress_xof_sse2
+	.p2align 2
+	.type zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2:
+	.cfi_startproc
+	// Load state rows and message; build v4 = {counter_lo, counter_hi,
+	// block_len, flags} (the fourth row of the BLAKE3 state).
+	ldp q3, q2, [x0]
+	ldp q5, q6, [x1]
+	add x10, x1, #32
+	lsr x11, x3, #32
+	fmov s4, w3
+	ld2 { v17.4s, v18.4s }, [x10]
+	adrp x10, .LCPI1_2
+	and w8, w2, #0xff
+	mov v4.s[1], w11
+	ldr q1, [x10, :lo12:.LCPI1_2]
+	and w9, w4, #0xff
+	adrp x12, .LCPI1_0
+	mov v4.s[2], w8
+	uzp1 v19.4s, v5.4s, v6.4s
+	add v3.4s, v2.4s, v3.4s
+	ldr q7, [x12, :lo12:.LCPI1_0]
+	mov v4.s[3], w9
+	add v3.4s, v3.4s, v19.4s
+	uzp2 v5.4s, v5.4s, v6.4s
+	ext v21.16b, v18.16b, v18.16b, #12
+	uzp1 v6.4s, v19.4s, v19.4s
+	ext v22.16b, v19.16b, v19.16b, #12
+	eor v4.16b, v3.16b, v4.16b
+	ext v20.16b, v17.16b, v17.16b, #12
+	ext v6.16b, v6.16b, v19.16b, #8
+	ext v19.16b, v19.16b, v22.16b, #12
+	zip1 v22.2d, v21.2d, v5.2d
+	// rev32 .8h swaps the 16-bit halves of each 32-bit lane — this is
+	// the G-function's "rotate right by 16".  Throughout the body the
+	// four rotations appear as: rev32 (16), ushr#12/shl#20/orr (12),
+	// ushr#8/shl#24/orr (8), ushr#7/shl#25/orr (7).  The seven BLAKE3
+	// rounds are fully unrolled, with the message-schedule permutation
+	// (zip/uzp/ext/trn/bsl) interleaved into the arithmetic by the
+	// compiler's scheduler.
+	rev32 v24.8h, v4.8h
+	mov v4.16b, v1.16b
+	zip2 v23.4s, v5.4s, v21.4s
+	uzp2 v6.4s, v6.4s, v5.4s
+	bsl v4.16b, v22.16b, v20.16b
+	add v3.4s, v3.4s, v5.4s
+	zip1 v5.4s, v23.4s, v20.4s
+	zip1 v22.4s, v20.4s, v23.4s
+	add v23.4s, v24.4s, v7.4s
+	ext v7.16b, v6.16b, v6.16b, #4
+	ext v25.16b, v4.16b, v4.16b, #12
+	ext v5.16b, v22.16b, v5.16b, #8
+	eor v2.16b, v23.16b, v2.16b
+	uzp1 v4.4s, v4.4s, v25.4s
+	uzp1 v22.4s, v7.4s, v7.4s
+	ext v25.16b, v7.16b, v7.16b, #12
+	ext v22.16b, v22.16b, v7.16b, #8
+	ext v7.16b, v7.16b, v25.16b, #12
+	ushr v25.4s, v2.4s, #12
+	shl v2.4s, v2.4s, #20
+	orr v2.16b, v2.16b, v25.16b
+	add v3.4s, v3.4s, v2.4s
+	eor v24.16b, v3.16b, v24.16b
+	add v3.4s, v3.4s, v17.4s
+	ushr v17.4s, v24.4s, #8
+	shl v18.4s, v24.4s, #24
+	orr v17.16b, v18.16b, v17.16b
+	add v18.4s, v17.4s, v23.4s
+	eor v2.16b, v18.16b, v2.16b
+	ushr v23.4s, v2.4s, #7
+	shl v2.4s, v2.4s, #25
+	ext v3.16b, v3.16b, v3.16b, #12
+	orr v2.16b, v2.16b, v23.16b
+	ext v17.16b, v17.16b, v17.16b, #8
+	add v3.4s, v2.4s, v3.4s
+	adrp x11, .LCPI1_1
+	eor v17.16b, v3.16b, v17.16b
+	ldr q16, [x11, :lo12:.LCPI1_1]
+	ext v18.16b, v18.16b, v18.16b, #4
+	rev32 v24.8h, v17.8h
+	movi v0.2d, #0xffffffff00000000
+	add v23.4s, v3.4s, v21.4s
+	mov v21.s[1], v20.s[2]
+	add v20.4s, v18.4s, v24.4s
+	bit v19.16b, v21.16b, v0.16b
+	eor v3.16b, v20.16b, v2.16b
+	uzp2 v2.4s, v22.4s, v19.4s
+	zip1 v17.2d, v5.2d, v19.2d
+	zip2 v18.4s, v19.4s, v5.4s
+	ushr v21.4s, v3.4s, #12
+	shl v3.4s, v3.4s, #20
+	ext v22.16b, v2.16b, v2.16b, #4
+	bsl v16.16b, v4.16b, v17.16b
+	zip1 v17.4s, v18.4s, v4.4s
+	zip1 v18.4s, v4.4s, v18.4s
+	orr v21.16b, v3.16b, v21.16b
+	ext v25.16b, v16.16b, v16.16b, #12
+	ext v3.16b, v18.16b, v17.16b, #8
+	uzp1 v18.4s, v22.4s, v22.4s
+	ext v26.16b, v22.16b, v22.16b, #12
+	add v23.4s, v23.4s, v21.4s
+	uzp1 v17.4s, v16.4s, v25.4s
+	ext v16.16b, v18.16b, v22.16b, #8
+	ext v18.16b, v22.16b, v26.16b, #12
+	eor v22.16b, v23.16b, v24.16b
+	add v6.4s, v23.4s, v6.4s
+	ushr v23.4s, v22.4s, #8
+	shl v22.4s, v22.4s, #24
+	orr v22.16b, v22.16b, v23.16b
+	add v20.4s, v22.4s, v20.4s
+	eor v21.16b, v20.16b, v21.16b
+	ushr v23.4s, v21.4s, #7
+	shl v21.4s, v21.4s, #25
+	ext v6.16b, v6.16b, v6.16b, #4
+	orr v21.16b, v21.16b, v23.16b
+	ext v22.16b, v22.16b, v22.16b, #8
+	add v6.4s, v21.4s, v6.4s
+	eor v22.16b, v6.16b, v22.16b
+	ext v20.16b, v20.16b, v20.16b, #12
+	add v6.4s, v6.4s, v19.4s
+	rev32 v19.8h, v22.8h
+	add v20.4s, v20.4s, v19.4s
+	eor v21.16b, v20.16b, v21.16b
+	ushr v22.4s, v21.4s, #12
+	shl v21.4s, v21.4s, #20
+	orr v21.16b, v21.16b, v22.16b
+	add v6.4s, v6.4s, v21.4s
+	eor v19.16b, v6.16b, v19.16b
+	ushr v22.4s, v19.4s, #8
+	shl v19.4s, v19.4s, #24
+	orr v19.16b, v19.16b, v22.16b
+	add v20.4s, v19.4s, v20.4s
+	eor v21.16b, v20.16b, v21.16b
+	ext v6.16b, v6.16b, v6.16b, #12
+	ushr v22.4s, v21.4s, #7
+	shl v21.4s, v21.4s, #25
+	add v6.4s, v6.4s, v4.4s
+	orr v21.16b, v21.16b, v22.16b
+	ext v19.16b, v19.16b, v19.16b, #8
+	add v6.4s, v6.4s, v21.4s
+	eor v19.16b, v6.16b, v19.16b
+	ext v20.16b, v20.16b, v20.16b, #4
+	rev32 v19.8h, v19.8h
+	add v20.4s, v20.4s, v19.4s
+	add v6.4s, v6.4s, v5.4s
+	mov v5.s[1], v4.s[2]
+	eor v4.16b, v20.16b, v21.16b
+	ushr v21.4s, v4.4s, #12
+	shl v4.4s, v4.4s, #20
+	orr v21.16b, v4.16b, v21.16b
+	add v6.4s, v6.4s, v21.4s
+	eor v19.16b, v6.16b, v19.16b
+	add v2.4s, v6.4s, v2.4s
+	ushr v6.4s, v19.4s, #8
+	shl v19.4s, v19.4s, #24
+	orr v6.16b, v19.16b, v6.16b
+	add v19.4s, v6.4s, v20.4s
+	eor v20.16b, v19.16b, v21.16b
+	ushr v21.4s, v20.4s, #7
+	shl v20.4s, v20.4s, #25
+	ext v2.16b, v2.16b, v2.16b, #4
+	orr v20.16b, v20.16b, v21.16b
+	ext v6.16b, v6.16b, v6.16b, #8
+	add v2.4s, v20.4s, v2.4s
+	eor v6.16b, v2.16b, v6.16b
+	ext v19.16b, v19.16b, v19.16b, #12
+	rev32 v6.8h, v6.8h
+	add v19.4s, v19.4s, v6.4s
+	mov v22.16b, v0.16b
+	eor v20.16b, v19.16b, v20.16b
+	bsl v22.16b, v5.16b, v7.16b
+	ushr v21.4s, v20.4s, #12
+	shl v20.4s, v20.4s, #20
+	add v2.4s, v2.4s, v22.4s
+	orr v20.16b, v20.16b, v21.16b
+	add v2.4s, v2.4s, v20.4s
+	eor v6.16b, v2.16b, v6.16b
+	ushr v21.4s, v6.4s, #8
+	shl v6.4s, v6.4s, #24
+	orr v6.16b, v6.16b, v21.16b
+	add v19.4s, v6.4s, v19.4s
+	eor v20.16b, v19.16b, v20.16b
+	ext v2.16b, v2.16b, v2.16b, #12
+	ushr v21.4s, v20.4s, #7
+	shl v20.4s, v20.4s, #25
+	add v2.4s, v2.4s, v17.4s
+	orr v20.16b, v20.16b, v21.16b
+	ext v6.16b, v6.16b, v6.16b, #8
+	add v2.4s, v2.4s, v20.4s
+	eor v6.16b, v2.16b, v6.16b
+	uzp2 v5.4s, v16.4s, v22.4s
+	zip1 v7.2d, v3.2d, v22.2d
+	zip2 v16.4s, v22.4s, v3.4s
+	ext v19.16b, v19.16b, v19.16b, #4
+	rev32 v22.8h, v6.8h
+	ext v23.16b, v5.16b, v5.16b, #4
+	bif v7.16b, v17.16b, v1.16b
+	zip1 v24.4s, v16.4s, v17.4s
+	zip1 v16.4s, v17.4s, v16.4s
+	add v21.4s, v2.4s, v3.4s
+	mov v3.s[1], v17.s[2]
+	add v17.4s, v19.4s, v22.4s
+	mov v19.16b, v0.16b
+	ext v25.16b, v7.16b, v7.16b, #12
+	ext v4.16b, v16.16b, v24.16b, #8
+	uzp1 v16.4s, v23.4s, v23.4s
+	bsl v19.16b, v3.16b, v18.16b
+	eor v2.16b, v17.16b, v20.16b
+	uzp1 v7.4s, v7.4s, v25.4s
+	ext v25.16b, v16.16b, v23.16b, #8
+	zip1 v3.2d, v4.2d, v19.2d
+	ushr v20.4s, v2.4s, #12
+	shl v2.4s, v2.4s, #20
+	ext v24.16b, v23.16b, v23.16b, #12
+	uzp2 v6.4s, v25.4s, v19.4s
+	zip2 v18.4s, v19.4s, v4.4s
+	bif v3.16b, v7.16b, v1.16b
+	orr v20.16b, v2.16b, v20.16b
+	ext v16.16b, v23.16b, v24.16b, #12
+	ext v23.16b, v6.16b, v6.16b, #4
+	zip1 v24.4s, v18.4s, v7.4s
+	zip1 v18.4s, v7.4s, v18.4s
+	ext v25.16b, v3.16b, v3.16b, #12
+	add v21.4s, v21.4s, v20.4s
+	ext v2.16b, v18.16b, v24.16b, #8
+	uzp1 v18.4s, v23.4s, v23.4s
+	ext v24.16b, v23.16b, v23.16b, #12
+	uzp1 v3.4s, v3.4s, v25.4s
+	eor v22.16b, v21.16b, v22.16b
+	ext v25.16b, v18.16b, v23.16b, #8
+	dup v18.4s, v2.s[3]
+	ext v23.16b, v23.16b, v24.16b, #12
+	add v5.4s, v21.4s, v5.4s
+	trn1 v21.4s, v3.4s, v3.4s
+	ushr v24.4s, v22.4s, #8
+	shl v22.4s, v22.4s, #24
+	ext v18.16b, v21.16b, v18.16b, #8
+	orr v21.16b, v22.16b, v24.16b
+	add v17.4s, v21.4s, v17.4s
+	eor v20.16b, v17.16b, v20.16b
+	ushr v22.4s, v20.4s, #7
+	shl v20.4s, v20.4s, #25
+	ext v5.16b, v5.16b, v5.16b, #4
+	orr v20.16b, v20.16b, v22.16b
+	ext v21.16b, v21.16b, v21.16b, #8
+	add v5.4s, v20.4s, v5.4s
+	eor v21.16b, v5.16b, v21.16b
+	ext v17.16b, v17.16b, v17.16b, #12
+	add v5.4s, v5.4s, v19.4s
+	rev32 v19.8h, v21.8h
+	add v17.4s, v17.4s, v19.4s
+	eor v20.16b, v17.16b, v20.16b
+	ushr v21.4s, v20.4s, #12
+	shl v20.4s, v20.4s, #20
+	orr v20.16b, v20.16b, v21.16b
+	add v5.4s, v5.4s, v20.4s
+	eor v19.16b, v5.16b, v19.16b
+	ushr v21.4s, v19.4s, #8
+	shl v19.4s, v19.4s, #24
+	orr v19.16b, v19.16b, v21.16b
+	add v17.4s, v19.4s, v17.4s
+	eor v20.16b, v17.16b, v20.16b
+	ext v5.16b, v5.16b, v5.16b, #12
+	ushr v21.4s, v20.4s, #7
+	shl v20.4s, v20.4s, #25
+	add v5.4s, v5.4s, v7.4s
+	orr v20.16b, v20.16b, v21.16b
+	ext v19.16b, v19.16b, v19.16b, #8
+	add v5.4s, v5.4s, v20.4s
+	eor v19.16b, v5.16b, v19.16b
+	ext v17.16b, v17.16b, v17.16b, #4
+	rev32 v22.8h, v19.8h
+	add v21.4s, v5.4s, v4.4s
+	mov v4.s[1], v7.s[2]
+	add v19.4s, v17.4s, v22.4s
+	bit v16.16b, v4.16b, v0.16b
+	eor v5.16b, v19.16b, v20.16b
+	uzp2 v4.4s, v25.4s, v16.4s
+	zip1 v7.2d, v2.2d, v16.2d
+	zip2 v17.4s, v16.4s, v2.4s
+	ushr v20.4s, v5.4s, #12
+	shl v5.4s, v5.4s, #20
+	ext v24.16b, v4.16b, v4.16b, #4
+	bif v7.16b, v3.16b, v1.16b
+	zip1 v25.4s, v17.4s, v3.4s
+	zip1 v17.4s, v3.4s, v17.4s
+	orr v20.16b, v5.16b, v20.16b
+	ext v26.16b, v7.16b, v7.16b, #12
+	ext v5.16b, v17.16b, v25.16b, #8
+	uzp1 v17.4s, v24.4s, v24.4s
+	ext v25.16b, v24.16b, v24.16b, #12
+	bit v23.16b, v18.16b, v0.16b
+	add v21.4s, v21.4s, v20.4s
+	uzp1 v7.4s, v7.4s, v26.4s
+	ext v26.16b, v17.16b, v24.16b, #8
+	ext v17.16b, v24.16b, v25.16b, #12
+	eor v22.16b, v21.16b, v22.16b
+	add v6.4s, v21.4s, v6.4s
+	zip1 v21.2d, v5.2d, v23.2d
+	zip2 v24.4s, v23.4s, v5.4s
+	bif v21.16b, v7.16b, v1.16b
+	zip1 v1.4s, v24.4s, v7.4s
+	zip1 v24.4s, v7.4s, v24.4s
+	ext v1.16b, v24.16b, v1.16b, #8
+	ushr v24.4s, v22.4s, #8
+	shl v22.4s, v22.4s, #24
+	orr v22.16b, v22.16b, v24.16b
+	add v19.4s, v22.4s, v19.4s
+	ext v24.16b, v21.16b, v21.16b, #12
+	eor v20.16b, v19.16b, v20.16b
+	uzp1 v21.4s, v21.4s, v24.4s
+	ushr v24.4s, v20.4s, #7
+	shl v20.4s, v20.4s, #25
+	orr v20.16b, v20.16b, v24.16b
+	ext v6.16b, v6.16b, v6.16b, #4
+	ext v22.16b, v22.16b, v22.16b, #8
+	add v6.4s, v20.4s, v6.4s
+	eor v22.16b, v6.16b, v22.16b
+	ext v19.16b, v19.16b, v19.16b, #12
+	add v6.4s, v6.4s, v16.4s
+	rev32 v16.8h, v22.8h
+	add v19.4s, v19.4s, v16.4s
+	eor v20.16b, v19.16b, v20.16b
+	ushr v22.4s, v20.4s, #12
+	shl v20.4s, v20.4s, #20
+	orr v20.16b, v20.16b, v22.16b
+	add v6.4s, v6.4s, v20.4s
+	eor v16.16b, v6.16b, v16.16b
+	ext v6.16b, v6.16b, v6.16b, #12
+	add v3.4s, v6.4s, v3.4s
+	ushr v6.4s, v16.4s, #8
+	shl v16.4s, v16.4s, #24
+	orr v6.16b, v16.16b, v6.16b
+	add v16.4s, v6.4s, v19.4s
+	eor v19.16b, v16.16b, v20.16b
+	ushr v20.4s, v19.4s, #7
+	shl v19.4s, v19.4s, #25
+	orr v19.16b, v19.16b, v20.16b
+	ext v6.16b, v6.16b, v6.16b, #8
+	add v3.4s, v3.4s, v19.4s
+	eor v6.16b, v3.16b, v6.16b
+	ext v16.16b, v16.16b, v16.16b, #4
+	add v2.4s, v3.4s, v2.4s
+	rev32 v3.8h, v6.8h
+	add v6.4s, v16.4s, v3.4s
+	eor v16.16b, v6.16b, v19.16b
+	ushr v19.4s, v16.4s, #12
+	shl v16.4s, v16.4s, #20
+	orr v16.16b, v16.16b, v19.16b
+	add v2.4s, v2.4s, v16.4s
+	eor v3.16b, v2.16b, v3.16b
+	add v2.4s, v2.4s, v4.4s
+	ushr v4.4s, v3.4s, #8
+	shl v3.4s, v3.4s, #24
+	orr v3.16b, v3.16b, v4.16b
+	add v4.4s, v3.4s, v6.4s
+	eor v6.16b, v4.16b, v16.16b
+	ushr v16.4s, v6.4s, #7
+	shl v6.4s, v6.4s, #25
+	ext v2.16b, v2.16b, v2.16b, #4
+	orr v6.16b, v6.16b, v16.16b
+	ext v3.16b, v3.16b, v3.16b, #8
+	add v2.4s, v6.4s, v2.4s
+	eor v3.16b, v2.16b, v3.16b
+	ext v4.16b, v4.16b, v4.16b, #12
+	rev32 v3.8h, v3.8h
+	add v4.4s, v4.4s, v3.4s
+	eor v6.16b, v4.16b, v6.16b
+	ushr v16.4s, v6.4s, #12
+	shl v6.4s, v6.4s, #20
+	add v2.4s, v2.4s, v23.4s
+	orr v6.16b, v6.16b, v16.16b
+	add v2.4s, v2.4s, v6.4s
+	eor v3.16b, v2.16b, v3.16b
+	ushr v16.4s, v3.4s, #8
+	shl v3.4s, v3.4s, #24
+	orr v3.16b, v3.16b, v16.16b
+	add v4.4s, v3.4s, v4.4s
+	eor v6.16b, v4.16b, v6.16b
+	ext v2.16b, v2.16b, v2.16b, #12
+	ushr v16.4s, v6.4s, #7
+	shl v6.4s, v6.4s, #25
+	add v2.4s, v2.4s, v7.4s
+	orr v6.16b, v6.16b, v16.16b
+	ext v3.16b, v3.16b, v3.16b, #8
+	add v2.4s, v2.4s, v6.4s
+	eor v3.16b, v2.16b, v3.16b
+	ext v4.16b, v4.16b, v4.16b, #4
+	rev32 v3.8h, v3.8h
+	add v2.4s, v2.4s, v5.4s
+	mov v5.s[1], v7.s[2]
+	add v4.4s, v4.4s, v3.4s
+	bsl v0.16b, v5.16b, v17.16b
+	eor v5.16b, v4.16b, v6.16b
+	ushr v6.4s, v5.4s, #12
+	shl v5.4s, v5.4s, #20
+	orr v5.16b, v5.16b, v6.16b
+	add v2.4s, v2.4s, v5.4s
+	eor v3.16b, v2.16b, v3.16b
+	ushr v6.4s, v3.4s, #8
+	shl v3.4s, v3.4s, #24
+	orr v3.16b, v3.16b, v6.16b
+	add v4.4s, v3.4s, v4.4s
+	uzp2 v18.4s, v26.4s, v18.4s
+	eor v5.16b, v4.16b, v5.16b
+	add v2.4s, v2.4s, v18.4s
+	ushr v6.4s, v5.4s, #7
+	shl v5.4s, v5.4s, #25
+	ext v2.16b, v2.16b, v2.16b, #4
+	orr v5.16b, v5.16b, v6.16b
+	ext v3.16b, v3.16b, v3.16b, #8
+	add v2.4s, v5.4s, v2.4s
+	eor v3.16b, v2.16b, v3.16b
+	ext v4.16b, v4.16b, v4.16b, #12
+	add v0.4s, v2.4s, v0.4s
+	rev32 v2.8h, v3.8h
+	add v3.4s, v4.4s, v2.4s
+	eor v4.16b, v3.16b, v5.16b
+	ushr v5.4s, v4.4s, #12
+	shl v4.4s, v4.4s, #20
+	orr v4.16b, v4.16b, v5.16b
+	add v0.4s, v0.4s, v4.4s
+	eor v2.16b, v0.16b, v2.16b
+	ushr v5.4s, v2.4s, #8
+	shl v2.4s, v2.4s, #24
+	orr v2.16b, v2.16b, v5.16b
+	add v3.4s, v2.4s, v3.4s
+	eor v4.16b, v3.16b, v4.16b
+	ext v0.16b, v0.16b, v0.16b, #12
+	ushr v5.4s, v4.4s, #7
+	shl v4.4s, v4.4s, #25
+	add v0.4s, v0.4s, v21.4s
+	orr v4.16b, v4.16b, v5.16b
+	ext v2.16b, v2.16b, v2.16b, #8
+	add v0.4s, v0.4s, v4.4s
+	eor v2.16b, v0.16b, v2.16b
+	ext v3.16b, v3.16b, v3.16b, #4
+	add v0.4s, v0.4s, v1.4s
+	rev32 v1.8h, v2.8h
+	add v2.4s, v3.4s, v1.4s
+	eor v3.16b, v2.16b, v4.16b
+	ushr v4.4s, v3.4s, #12
+	shl v3.4s, v3.4s, #20
+	orr v3.16b, v3.16b, v4.16b
+	add v0.4s, v0.4s, v3.4s
+	eor v1.16b, v0.16b, v1.16b
+	ushr v4.4s, v1.4s, #8
+	shl v1.4s, v1.4s, #24
+	orr v1.16b, v1.16b, v4.16b
+	add v2.4s, v1.4s, v2.4s
+	eor v3.16b, v2.16b, v3.16b
+	ushr v4.4s, v3.4s, #7
+	shl v3.4s, v3.4s, #25
+	ext v0.16b, v0.16b, v0.16b, #4
+	ext v1.16b, v1.16b, v1.16b, #8
+	ext v2.16b, v2.16b, v2.16b, #12
+	orr v3.16b, v3.16b, v4.16b
+	// Final feed-forward and XOF output: bytes 0..31 of out = the two
+	// low state rows XORed with the two high rows; bytes 32..63 = the
+	// input chaining value (reloaded from x0) XORed with the high rows.
+	// The second half is what distinguishes the XOF variant from
+	// compress_in_place.
+	eor v0.16b, v2.16b, v0.16b
+	eor v3.16b, v3.16b, v1.16b
+	stp q0, q3, [x5]
+	ldr q0, [x0]
+	eor v0.16b, v0.16b, v2.16b
+	str q0, [x5, #32]
+	ldr q0, [x0, #16]
+	eor v0.16b, v0.16b, v1.16b
+	str q0, [x5, #48]
+	ret
+.Lfunc_end1:
+	.size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2
+	.cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI2_0:
+ .word 0
+ .word 1
+ .word 2
+ .word 3
+ .text
+ .globl zfs_blake3_hash_many_sse2
+ .p2align 2
+ .type zfs_blake3_hash_many_sse2,@function
+zfs_blake3_hash_many_sse2:
+ .cfi_startproc
+ stp d15, d14, [sp, #-160]!
+ stp d13, d12, [sp, #16]
+ stp d11, d10, [sp, #32]
+ stp d9, d8, [sp, #48]
+ stp x29, x30, [sp, #64]
+ stp x28, x27, [sp, #80]
+ stp x26, x25, [sp, #96]
+ stp x24, x23, [sp, #112]
+ stp x22, x21, [sp, #128]
+ stp x20, x19, [sp, #144]
+ mov x29, sp
+ sub sp, sp, #384
+ .cfi_def_cfa w29, 160
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w21, -24
+ .cfi_offset w22, -32
+ .cfi_offset w23, -40
+ .cfi_offset w24, -48
+ .cfi_offset w25, -56
+ .cfi_offset w26, -64
+ .cfi_offset w27, -72
+ .cfi_offset w28, -80
+ .cfi_offset w30, -88
+ .cfi_offset w29, -96
+ .cfi_offset b8, -104
+ .cfi_offset b9, -112
+ .cfi_offset b10, -120
+ .cfi_offset b11, -128
+ .cfi_offset b12, -136
+ .cfi_offset b13, -144
+ .cfi_offset b14, -152
+ .cfi_offset b15, -160
+ ldr x26, [x29, #168]
+ ldrb w27, [x29, #160]
+ mov w19, w6
+ mov x20, x4
+ mov x22, x2
+ mov x28, x1
+ cmp x1, #4
+ mov x24, x0
+ str x3, [sp, #40]
+ b.lo .LBB2_8
+ adrp x9, .LCPI2_0
+ ldr q0, [x9, :lo12:.LCPI2_0]
+ sbfx w11, w5, #0, #1
+ dup v1.4s, w11
+ mov w9, #58983
+ mov w10, #44677
+ and v0.16b, v1.16b, v0.16b
+ mov w11, #62322
+ mov w12, #62778
+ orr w8, w7, w19
+ movk w9, #27145, lsl #16
+ movk w10, #47975, lsl #16
+ movk w11, #15470, lsl #16
+ str q0, [sp, #16]
+ orr v0.4s, #128, lsl #24
+ movk w12, #42319, lsl #16
+ str q0, [sp]
+.LBB2_2:
+ ldr x0, [sp, #40]
+ mov x13, x0
+ ld1r { v20.4s }, [x13], #4
+ add x14, x0, #8
+ add x15, x0, #12
+ add x16, x0, #16
+ add x17, x0, #20
+ add x18, x0, #24
+ add x0, x0, #28
+ ld1r { v17.4s }, [x14]
+ ld1r { v6.4s }, [x15]
+ ld1r { v8.4s }, [x16]
+ ld1r { v9.4s }, [x17]
+ ld1r { v31.4s }, [x18]
+ ld1r { v26.4s }, [x13]
+ ld1r { v15.4s }, [x0]
+ cbz x22, .LBB2_7
+ ldr q1, [sp, #16]
+ dup v0.4s, w20
+ ldp x13, x14, [x24]
+ ldp x15, x16, [x24, #16]
+ add v1.4s, v0.4s, v1.4s
+ movi v0.4s, #128, lsl #24
+ str q1, [sp, #64]
+ eor v0.16b, v1.16b, v0.16b
+ ldr q1, [sp]
+ lsr x18, x20, #32
+ mov x17, xzr
+ cmgt v0.4s, v1.4s, v0.4s
+ dup v1.4s, w18
+ sub v0.4s, v1.4s, v0.4s
+ mov w18, w8
+ str q0, [sp, #48]
+.LBB2_4:
+ mov w2, #16
+ bfi x2, x17, #6, #58
+ ldr q1, [x13, x2]
+ ldr q3, [x14, x2]
+ ldr q2, [x15, x2]
+ ldr q4, [x16, x2]
+ mov w2, #32
+ bfi x2, x17, #6, #58
+ ldr q5, [x13, x2]
+ ldr q18, [x14, x2]
+ ldr q19, [x15, x2]
+ ldr q23, [x16, x2]
+ mov w2, #48
+ lsl x3, x17, #6
+ bfi x2, x17, #6, #58
+ add x17, x17, #1
+ ldr q0, [x13, x3]
+ ldr q21, [x14, x3]
+ ldr q7, [x15, x3]
+ ldr q16, [x16, x3]
+ cmp x17, x22
+ ldr q13, [x13, x2]
+ ldr q14, [x14, x2]
+ ldr q29, [x15, x2]
+ ldr q10, [x16, x2]
+ csel w2, w27, wzr, eq
+ orr w18, w2, w18
+ mov x0, xzr
+ and w18, w18, #0xff
+ add x3, x3, #256
+.LBB2_5:
+ ldr x2, [x24, x0]
+ add x0, x0, #8
+ cmp x0, #32
+ add x2, x2, x3
+ prfm pldl1keep, [x2]
+ b.ne .LBB2_5
+ dup v22.4s, w18
+ str q22, [sp, #192]
+ zip1 v27.4s, v0.4s, v21.4s
+ zip2 v21.4s, v0.4s, v21.4s
+ zip1 v0.4s, v7.4s, v16.4s
+ zip2 v22.4s, v7.4s, v16.4s
+ zip1 v7.4s, v1.4s, v3.4s
+ zip1 v25.4s, v2.4s, v4.4s
+ zip2 v16.4s, v2.4s, v4.4s
+ zip1 v11.4s, v19.4s, v23.4s
+ zip2 v12.4s, v19.4s, v23.4s
+ zip1 v19.4s, v13.4s, v14.4s
+ zip2 v23.4s, v13.4s, v14.4s
+ zip1 v13.4s, v29.4s, v10.4s
+ zip2 v14.4s, v29.4s, v10.4s
+ add v10.4s, v20.4s, v8.4s
+ add v2.4s, v26.4s, v9.4s
+ ext v20.16b, v22.16b, v21.16b, #8
+ ext v26.16b, v25.16b, v7.16b, #8
+ zip2 v24.4s, v1.4s, v3.4s
+ add v1.4s, v6.4s, v15.4s
+ ext v6.16b, v0.16b, v27.16b, #8
+ ext v20.16b, v21.16b, v20.16b, #8
+ mov v21.d[1], v22.d[0]
+ ext v22.16b, v7.16b, v26.16b, #8
+ mov v7.d[1], v25.d[0]
+ add v3.4s, v17.4s, v31.4s
+ str q1, [sp, #144]
+ ext v1.16b, v27.16b, v6.16b, #8
+ mov v6.16b, v7.16b
+ zip1 v28.4s, v5.4s, v18.4s
+ stur q1, [x29, #-80]
+ mov v1.16b, v27.16b
+ mov v27.16b, v24.16b
+ add v3.4s, v3.4s, v6.4s
+ ldr q6, [sp, #64]
+ ext v29.16b, v16.16b, v24.16b, #8
+ mov v1.d[1], v0.d[0]
+ ext v0.16b, v11.16b, v28.16b, #8
+ mov v27.d[1], v16.d[0]
+ ext v16.16b, v14.16b, v23.16b, #8
+ stur q7, [x29, #-144]
+ ext v7.16b, v24.16b, v29.16b, #8
+ ext v29.16b, v28.16b, v0.16b, #8
+ ext v0.16b, v23.16b, v16.16b, #8
+ mov v23.d[1], v14.d[0]
+ stp q0, q23, [sp, #80]
+ add v0.4s, v10.4s, v1.4s
+ eor v16.16b, v0.16b, v6.16b
+ ldr q6, [sp, #48]
+ add v2.4s, v2.4s, v21.4s
+ mov v28.d[1], v11.d[0]
+ zip2 v18.4s, v5.4s, v18.4s
+ eor v10.16b, v2.16b, v6.16b
+ movi v6.4s, #64
+ eor v11.16b, v3.16b, v6.16b
+ ldr q6, [sp, #144]
+ dup v17.4s, w9
+ ext v30.16b, v12.16b, v18.16b, #8
+ rev32 v16.8h, v16.8h
+ dup v5.4s, w10
+ ext v25.16b, v18.16b, v30.16b, #8
+ mov v30.16b, v23.16b
+ mov v23.16b, v1.16b
+ str q1, [sp, #160]
+ rev32 v10.8h, v10.8h
+ add v1.4s, v16.4s, v17.4s
+ add v17.4s, v6.4s, v27.4s
+ ldr q6, [sp, #192]
+ dup v4.4s, w11
+ rev32 v11.8h, v11.8h
+ add v5.4s, v10.4s, v5.4s
+ eor v8.16b, v1.16b, v8.16b
+ stur q21, [x29, #-128]
+ mov v18.d[1], v12.d[0]
+ add v4.4s, v11.4s, v4.4s
+ eor v9.16b, v5.16b, v9.16b
+ ushr v12.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ ldur q21, [x29, #-80]
+ ext v26.16b, v13.16b, v19.16b, #8
+ eor v31.16b, v4.16b, v31.16b
+ orr v8.16b, v8.16b, v12.16b
+ ushr v12.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ ext v26.16b, v19.16b, v26.16b, #8
+ mov v19.d[1], v13.d[0]
+ orr v9.16b, v9.16b, v12.16b
+ ushr v12.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v17.16b, v6.16b
+ orr v31.16b, v31.16b, v12.16b
+ dup v12.4s, w12
+ rev32 v13.8h, v13.8h
+ add v12.4s, v13.4s, v12.4s
+ add v0.4s, v0.4s, v21.4s
+ eor v14.16b, v12.16b, v15.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v22.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v28.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v18.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v19.4s
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v30.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ mov v24.16b, v7.16b
+ stur q7, [x29, #-112]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ mov v7.16b, v26.16b
+ add v3.4s, v3.4s, v26.4s
+ ldr q26, [sp, #80]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ add v0.4s, v0.4s, v29.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v25.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v13.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ str q22, [sp, #128]
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ ldur q22, [x29, #-128]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ mov v6.16b, v18.16b
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ ldur q18, [x29, #-144]
+ orr v8.16b, v8.16b, v15.16b
+ add v0.4s, v0.4s, v22.4s
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v24.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v18.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v13.16b, v17.16b, v13.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v13.8h, v13.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v13.4s
+ add v0.4s, v0.4s, v27.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v6.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v23.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v21.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v19.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v29.4s
+ str q28, [sp, #112]
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldp q28, q23, [sp, #112]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ldr q21, [sp, #96]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ add v0.4s, v0.4s, v25.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v23.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v21.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v28.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v13.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ mov v30.16b, v29.16b
+ mov v29.16b, v25.16b
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ ldur q25, [x29, #-112]
+ orr v8.16b, v8.16b, v15.16b
+ add v0.4s, v0.4s, v20.4s
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v7.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v25.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v13.16b, v17.16b, v13.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v13.8h, v13.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v13.4s
+ add v0.4s, v0.4s, v18.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v19.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v22.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v21.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v27.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v29.4s
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v28.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ ldr q24, [sp, #160]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ stur q7, [x29, #-64]
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ mov v7.16b, v26.16b
+ add v3.4s, v3.4s, v26.4s
+ ldur q26, [x29, #-80]
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ add v0.4s, v0.4s, v23.4s
+ orr v8.16b, v8.16b, v15.16b
+ add v15.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v24.4s
+ eor v0.16b, v15.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ ushr v13.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v0.16b, v0.16b, v13.16b
+ ushr v13.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v13.16b
+ ushr v13.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v13.16b
+ ushr v13.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v13.16b
+ ushr v13.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ orr v9.16b, v9.16b, v13.16b
+ ushr v13.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ add v1.4s, v10.4s, v1.4s
+ orr v31.16b, v31.16b, v13.16b
+ eor v13.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ ushr v14.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v14.16b
+ ushr v14.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ stur q6, [x29, #-96]
+ orr v8.16b, v8.16b, v14.16b
+ add v14.4s, v15.4s, v6.4s
+ ldur q6, [x29, #-64]
+ mov v18.16b, v19.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v18.4s
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v21.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v6.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ str q27, [sp, #176]
+ mov v27.16b, v30.16b
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ add v14.4s, v14.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v27.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v20.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ mov v30.16b, v23.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v0.16b, v17.16b, v0.16b
+ add v1.4s, v16.4s, v1.4s
+ ldur q23, [x29, #-144]
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v0.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v23.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v29.4s
+ orr v13.16b, v13.16b, v15.16b
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v30.4s
+ rev32 v0.8h, v0.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ add v4.4s, v4.4s, v0.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldur q22, [x29, #-128]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ ldr q26, [sp, #176]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v5.4s, v11.4s
+ add v14.4s, v14.4s, v24.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v22.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v28.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v15.16b
+ add v14.4s, v14.4s, v18.4s
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v27.4s
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v7.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v21.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ add v14.4s, v14.4s, v6.4s
+ ldur q6, [x29, #-96]
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ stur q20, [x29, #-160]
+ mov v20.16b, v29.16b
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ mov v19.16b, v29.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ mov v19.16b, v28.16b
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v6.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v19.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v0.16b, v17.16b, v0.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v0.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v25.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v13.16b, v13.16b, v15.16b
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v24.4s
+ rev32 v0.8h, v0.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ mov v29.16b, v27.16b
+ add v4.4s, v4.4s, v0.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldur q27, [x29, #-160]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ldur q6, [x29, #-80]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v5.4s, v11.4s
+ add v14.4s, v14.4s, v22.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v27.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v6.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v23.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v15.16b
+ add v14.4s, v14.4s, v29.4s
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ mov v28.16b, v7.16b
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v19.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v28.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ add v14.4s, v14.4s, v21.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ add v3.4s, v3.4s, v18.4s
+ orr v10.16b, v10.16b, v15.16b
+ add v15.4s, v3.4s, v31.4s
+ eor v3.16b, v15.16b, v11.16b
+ ushr v11.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v11.16b, v3.16b, v11.16b
+ add v3.4s, v17.4s, v6.4s
+ add v17.4s, v3.4s, v13.4s
+ eor v0.16b, v17.16b, v0.16b
+ ushr v3.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ add v1.4s, v16.4s, v1.4s
+ orr v0.16b, v0.16b, v3.16b
+ eor v3.16b, v1.16b, v8.16b
+ ushr v8.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ add v5.4s, v10.4s, v5.4s
+ orr v8.16b, v3.16b, v8.16b
+ eor v3.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ ushr v9.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ mov v7.16b, v23.16b
+ mov v23.16b, v28.16b
+ mov v28.16b, v6.16b
+ orr v3.16b, v3.16b, v9.16b
+ ushr v9.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ ldur q6, [x29, #-64]
+ orr v31.16b, v31.16b, v9.16b
+ add v9.4s, v0.4s, v12.4s
+ eor v12.16b, v9.16b, v13.16b
+ ushr v13.4s, v12.4s, #7
+ shl v12.4s, v12.4s, #25
+ orr v12.16b, v12.16b, v13.16b
+ add v13.4s, v14.4s, v6.4s
+ add v13.4s, v13.4s, v3.4s
+ eor v0.16b, v13.16b, v0.16b
+ add v2.4s, v2.4s, v24.4s
+ rev32 v14.8h, v0.8h
+ add v0.4s, v2.4s, v31.4s
+ add v6.4s, v4.4s, v14.4s
+ eor v2.16b, v0.16b, v16.16b
+ eor v3.16b, v6.16b, v3.16b
+ rev32 v16.8h, v2.8h
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v2.4s, v9.4s, v16.4s
+ orr v4.16b, v3.16b, v4.16b
+ eor v3.16b, v2.16b, v31.16b
+ ushr v31.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v31.16b
+ add v31.4s, v15.4s, v22.4s
+ add v31.4s, v31.4s, v12.4s
+ add v17.4s, v17.4s, v7.4s
+ eor v9.16b, v31.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ rev32 v9.8h, v9.8h
+ eor v11.16b, v17.16b, v11.16b
+ add v1.4s, v1.4s, v9.4s
+ rev32 v11.8h, v11.8h
+ eor v10.16b, v1.16b, v12.16b
+ add v5.4s, v5.4s, v11.4s
+ ushr v12.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ orr v10.16b, v10.16b, v12.16b
+ ushr v12.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ orr v8.16b, v8.16b, v12.16b
+ add v12.4s, v13.4s, v27.4s
+ add v12.4s, v12.4s, v4.4s
+ eor v13.16b, v12.16b, v14.16b
+ ldur q14, [x29, #-96]
+ mov v25.16b, v29.16b
+ add v29.4s, v12.4s, v20.4s
+ add v20.4s, v31.4s, v26.4s
+ add v0.4s, v0.4s, v14.4s
+ add v0.4s, v0.4s, v3.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v0.4s, v0.4s, v30.4s
+ ldur q30, [x29, #-112]
+ add v20.4s, v20.4s, v10.4s
+ eor v31.16b, v20.16b, v9.16b
+ add v20.4s, v20.4s, v28.4s
+ add v17.4s, v17.4s, v30.4s
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v17.16b, v11.16b
+ ushr v28.4s, v13.4s, #8
+ shl v11.4s, v13.4s, #24
+ orr v28.16b, v11.16b, v28.16b
+ ushr v11.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v16.16b, v16.16b, v11.16b
+ ushr v11.4s, v31.4s, #8
+ shl v31.4s, v31.4s, #24
+ add v6.4s, v28.4s, v6.4s
+ orr v31.16b, v31.16b, v11.16b
+ ushr v11.4s, v9.4s, #8
+ shl v9.4s, v9.4s, #24
+ add v2.4s, v16.4s, v2.4s
+ eor v4.16b, v6.16b, v4.16b
+ orr v9.16b, v9.16b, v11.16b
+ add v1.4s, v31.4s, v1.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v11.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v5.4s, v9.4s, v5.4s
+ eor v10.16b, v1.16b, v10.16b
+ orr v4.16b, v4.16b, v11.16b
+ ushr v11.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v3.16b, v3.16b, v11.16b
+ ushr v11.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ orr v10.16b, v10.16b, v11.16b
+ ushr v11.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v11.16b
+ add v29.4s, v29.4s, v8.4s
+ eor v16.16b, v29.16b, v16.16b
+ add v0.4s, v0.4s, v4.4s
+ mov v12.16b, v26.16b
+ add v17.4s, v17.4s, v19.4s
+ add v26.4s, v29.4s, v23.4s
+ eor v29.16b, v0.16b, v31.16b
+ add v20.4s, v20.4s, v3.4s
+ rev32 v16.8h, v16.8h
+ stur q18, [x29, #-176]
+ mov v18.16b, v27.16b
+ add v0.4s, v0.4s, v24.4s
+ eor v27.16b, v20.16b, v9.16b
+ add v17.4s, v17.4s, v10.4s
+ rev32 v24.8h, v29.8h
+ add v1.4s, v1.4s, v16.4s
+ add v20.4s, v20.4s, v25.4s
+ eor v25.16b, v17.16b, v28.16b
+ rev32 v27.8h, v27.8h
+ add v5.4s, v5.4s, v24.4s
+ eor v28.16b, v1.16b, v8.16b
+ rev32 v25.8h, v25.8h
+ add v6.4s, v6.4s, v27.4s
+ eor v4.16b, v5.16b, v4.16b
+ ushr v31.4s, v28.4s, #12
+ shl v28.4s, v28.4s, #20
+ add v2.4s, v2.4s, v25.4s
+ eor v3.16b, v6.16b, v3.16b
+ orr v28.16b, v28.16b, v31.16b
+ ushr v31.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ eor v29.16b, v2.16b, v10.16b
+ orr v4.16b, v4.16b, v31.16b
+ ushr v31.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v26.4s, v26.4s, v28.4s
+ orr v3.16b, v3.16b, v31.16b
+ ushr v31.4s, v29.4s, #12
+ shl v29.4s, v29.4s, #20
+ eor v16.16b, v26.16b, v16.16b
+ add v0.4s, v0.4s, v4.4s
+ add v17.4s, v17.4s, v12.4s
+ orr v29.16b, v29.16b, v31.16b
+ eor v24.16b, v0.16b, v24.16b
+ add v0.4s, v0.4s, v22.4s
+ add v20.4s, v20.4s, v3.4s
+ ushr v22.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ add v23.4s, v26.4s, v21.4s
+ eor v21.16b, v20.16b, v27.16b
+ add v17.4s, v17.4s, v29.4s
+ orr v16.16b, v16.16b, v22.16b
+ ushr v22.4s, v24.4s, #8
+ shl v24.4s, v24.4s, #24
+ eor v25.16b, v17.16b, v25.16b
+ orr v22.16b, v24.16b, v22.16b
+ ushr v24.4s, v21.4s, #8
+ shl v21.4s, v21.4s, #24
+ orr v21.16b, v21.16b, v24.16b
+ ushr v24.4s, v25.4s, #8
+ shl v25.4s, v25.4s, #24
+ add v1.4s, v16.4s, v1.4s
+ orr v24.16b, v25.16b, v24.16b
+ add v5.4s, v22.4s, v5.4s
+ eor v25.16b, v1.16b, v28.16b
+ add v6.4s, v21.4s, v6.4s
+ eor v4.16b, v5.16b, v4.16b
+ ushr v27.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ add v2.4s, v24.4s, v2.4s
+ eor v3.16b, v6.16b, v3.16b
+ orr v25.16b, v25.16b, v27.16b
+ ushr v27.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ ldur q19, [x29, #-176]
+ eor v26.16b, v2.16b, v29.16b
+ orr v4.16b, v4.16b, v27.16b
+ ushr v27.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ orr v3.16b, v3.16b, v27.16b
+ ushr v27.4s, v26.4s, #7
+ shl v26.4s, v26.4s, #25
+ add v20.4s, v20.4s, v18.4s
+ add v17.4s, v17.4s, v30.4s
+ orr v26.16b, v26.16b, v27.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v0.4s, v0.4s, v19.4s
+ add v19.4s, v20.4s, v26.4s
+ add v17.4s, v17.4s, v25.4s
+ eor v20.16b, v19.16b, v22.16b
+ add v7.4s, v19.4s, v7.4s
+ eor v19.16b, v17.16b, v21.16b
+ ldur q21, [x29, #-64]
+ add v23.4s, v23.4s, v4.4s
+ eor v24.16b, v23.16b, v24.16b
+ rev32 v16.8h, v16.8h
+ add v17.4s, v17.4s, v21.4s
+ rev32 v21.8h, v24.8h
+ add v6.4s, v6.4s, v21.4s
+ rev32 v20.8h, v20.8h
+ add v2.4s, v2.4s, v16.4s
+ eor v4.16b, v6.16b, v4.16b
+ rev32 v19.8h, v19.8h
+ add v1.4s, v1.4s, v20.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v24.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v5.4s, v5.4s, v19.4s
+ eor v22.16b, v1.16b, v26.16b
+ orr v4.16b, v4.16b, v24.16b
+ ushr v24.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v18.4s, v23.4s, v14.4s
+ eor v23.16b, v5.16b, v25.16b
+ orr v3.16b, v3.16b, v24.16b
+ ushr v24.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ orr v22.16b, v22.16b, v24.16b
+ ushr v24.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v24.16b
+ add v18.4s, v18.4s, v4.4s
+ add v0.4s, v0.4s, v3.4s
+ add v24.4s, v17.4s, v23.4s
+ eor v17.16b, v18.16b, v21.16b
+ add v7.4s, v7.4s, v22.4s
+ eor v16.16b, v0.16b, v16.16b
+ ushr v21.4s, v17.4s, #8
+ shl v17.4s, v17.4s, #24
+ eor v20.16b, v7.16b, v20.16b
+ orr v21.16b, v17.16b, v21.16b
+ ushr v17.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v19.16b, v24.16b, v19.16b
+ orr v16.16b, v16.16b, v17.16b
+ ushr v17.4s, v20.4s, #8
+ shl v20.4s, v20.4s, #24
+ orr v25.16b, v20.16b, v17.16b
+ ushr v17.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v17.16b
+ add v1.4s, v25.4s, v1.4s
+ eor v22.16b, v1.16b, v22.16b
+ eor v20.16b, v1.16b, v18.16b
+ add v1.4s, v19.4s, v5.4s
+ eor v26.16b, v1.16b, v0.16b
+ add v0.4s, v21.4s, v6.4s
+ eor v5.16b, v1.16b, v23.16b
+ eor v1.16b, v0.16b, v4.16b
+ eor v17.16b, v0.16b, v7.16b
+ add v0.4s, v16.4s, v2.4s
+ eor v2.16b, v0.16b, v3.16b
+ eor v6.16b, v0.16b, v24.16b
+ ushr v0.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v0.16b, v1.16b, v0.16b
+ ushr v1.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v2.16b, v1.16b
+ ushr v2.4s, v22.4s, #7
+ shl v3.4s, v22.4s, #25
+ orr v2.16b, v3.16b, v2.16b
+ ushr v3.4s, v5.4s, #7
+ shl v4.4s, v5.4s, #25
+ orr v3.16b, v4.16b, v3.16b
+ eor v8.16b, v16.16b, v3.16b
+ eor v9.16b, v25.16b, v0.16b
+ eor v31.16b, v1.16b, v19.16b
+ cmp x17, x22
+ eor v15.16b, v2.16b, v21.16b
+ mov w18, w19
+ b.ne .LBB2_4
+.LBB2_7:
+ zip1 v0.4s, v20.4s, v26.4s
+ zip2 v1.4s, v20.4s, v26.4s
+ zip1 v2.4s, v17.4s, v6.4s
+ zip2 v3.4s, v17.4s, v6.4s
+ zip1 v4.4s, v8.4s, v9.4s
+ zip2 v5.4s, v8.4s, v9.4s
+ zip1 v6.4s, v31.4s, v15.4s
+ zip2 v7.4s, v31.4s, v15.4s
+ add x13, x20, #4
+ tst w5, #0x1
+ sub x28, x28, #4
+ zip1 v16.2d, v0.2d, v2.2d
+ zip2 v0.2d, v0.2d, v2.2d
+ zip1 v2.2d, v1.2d, v3.2d
+ zip2 v1.2d, v1.2d, v3.2d
+ zip1 v3.2d, v4.2d, v6.2d
+ zip2 v4.2d, v4.2d, v6.2d
+ zip1 v6.2d, v5.2d, v7.2d
+ zip2 v5.2d, v5.2d, v7.2d
+ add x24, x24, #32
+ csel x20, x13, x20, ne
+ cmp x28, #3
+ stp q16, q3, [x26]
+ stp q0, q4, [x26, #32]
+ stp q2, q6, [x26, #64]
+ stp q1, q5, [x26, #96]
+ add x26, x26, #128
+ b.hi .LBB2_2
+.LBB2_8:
+ cbz x28, .LBB2_16
+ orr w8, w7, w19
+ and x21, x5, #0x1
+ stur w8, [x29, #-64]
+.LBB2_10:
+ ldr x8, [sp, #40]
+ ldr x25, [x24]
+ ldur w4, [x29, #-64]
+ ldp q1, q0, [x8]
+ mov x8, x22
+ stp q1, q0, [x29, #-48]
+.LBB2_11:
+ subs x23, x8, #1
+ b.eq .LBB2_13
+ cbnz x8, .LBB2_14
+ b .LBB2_15
+.LBB2_13:
+ orr w4, w4, w27
+.LBB2_14:
+ sub x0, x29, #48
+ mov w2, #64
+ mov x1, x25
+ mov x3, x20
+ bl zfs_blake3_compress_in_place_sse2
+ add x25, x25, #64
+ mov x8, x23
+ mov w4, w19
+ b .LBB2_11
+.LBB2_15:
+ ldp q0, q1, [x29, #-48]
+ add x20, x20, x21
+ add x24, x24, #8
+ subs x28, x28, #1
+ stp q0, q1, [x26], #32
+ b.ne .LBB2_10
+.LBB2_16:
+ add sp, sp, #384
+ ldp x20, x19, [sp, #144]
+ ldp x22, x21, [sp, #128]
+ ldp x24, x23, [sp, #112]
+ ldp x26, x25, [sp, #96]
+ ldp x28, x27, [sp, #80]
+ ldp x29, x30, [sp, #64]
+ ldp d9, d8, [sp, #48]
+ ldp d11, d10, [sp, #32]
+ ldp d13, d12, [sp, #16]
+ ldp d15, d14, [sp], #160
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
new file mode 100644
index 000000000..eb6946400
--- /dev/null
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@@ -0,0 +1,2463 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE4.1 -> ARMv8-A
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if defined(__aarch64__)
+ .text
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI0_1:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI0_2:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI0_3:
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 20
+ .byte 21
+ .byte 22
+ .byte 23
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+.LCPI0_4:
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 4
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+ .text
+ .globl zfs_blake3_compress_in_place_sse41
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41:
+ .cfi_startproc
+ ldp q7, q6, [x0]
+ ldp q17, q18, [x1]
+ add x12, x1, #32
+ ld2 { v4.4s, v5.4s }, [x12]
+ lsr x10, x3, #32
+ fmov s16, w3
+ adrp x13, .LCPI0_0
+ adrp x11, .LCPI0_1
+ and w8, w2, #0xff
+ mov v16.s[1], w10
+ ldr q0, [x13, :lo12:.LCPI0_0]
+ ldr q20, [x11, :lo12:.LCPI0_1]
+ adrp x11, .LCPI0_4
+ and w9, w4, #0xff
+ ldr q2, [x11, :lo12:.LCPI0_4]
+ mov v16.s[2], w8
+ uzp1 v21.4s, v17.4s, v18.4s
+ add v7.4s, v6.4s, v7.4s
+ adrp x12, .LCPI0_3
+ mov v16.s[3], w9
+ uzp2 v18.4s, v17.4s, v18.4s
+ add v7.4s, v7.4s, v21.4s
+ ext v17.16b, v5.16b, v5.16b, #12
+ ldr q3, [x12, :lo12:.LCPI0_3]
+ ext v24.16b, v4.16b, v4.16b, #12
+ eor v16.16b, v7.16b, v16.16b
+ mov v27.16b, v17.16b
+ uzp1 v19.4s, v21.4s, v21.4s
+ ext v25.16b, v21.16b, v21.16b, #12
+ zip2 v28.4s, v18.4s, v17.4s
+ tbl v29.16b, { v16.16b }, v0.16b
+ mov v27.s[1], v24.s[2]
+ zip1 v23.2d, v17.2d, v18.2d
+ ext v19.16b, v19.16b, v21.16b, #8
+ add v22.4s, v29.4s, v20.4s
+ ext v26.16b, v21.16b, v25.16b, #12
+ tbl v20.16b, { v23.16b, v24.16b }, v2.16b
+ zip1 v21.4s, v28.4s, v24.4s
+ zip1 v23.4s, v24.4s, v28.4s
+ uzp2 v19.4s, v19.4s, v18.4s
+ eor v24.16b, v22.16b, v6.16b
+ ext v25.16b, v20.16b, v20.16b, #12
+ ext v6.16b, v23.16b, v21.16b, #8
+ add v7.4s, v7.4s, v18.4s
+ ext v18.16b, v19.16b, v19.16b, #4
+ tbl v16.16b, { v26.16b, v27.16b }, v3.16b
+ uzp1 v21.4s, v20.4s, v25.4s
+ mov v26.16b, v6.16b
+ ext v23.16b, v18.16b, v18.16b, #12
+ mov v26.s[1], v21.s[2]
+ adrp x10, .LCPI0_2
+ ext v25.16b, v18.16b, v23.16b, #12
+ uzp1 v23.4s, v18.4s, v18.4s
+ ldr q1, [x10, :lo12:.LCPI0_2]
+ ext v18.16b, v23.16b, v18.16b, #8
+ ushr v23.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ orr v23.16b, v24.16b, v23.16b
+ add v7.4s, v7.4s, v23.4s
+ eor v27.16b, v29.16b, v7.16b
+ add v4.4s, v7.4s, v4.4s
+ tbl v7.16b, { v25.16b, v26.16b }, v3.16b
+ tbl v26.16b, { v27.16b }, v1.16b
+ add v22.4s, v22.4s, v26.4s
+ uzp2 v18.4s, v18.4s, v16.4s
+ eor v23.16b, v23.16b, v22.16b
+ ext v5.16b, v18.16b, v18.16b, #4
+ ushr v27.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ uzp1 v25.4s, v5.4s, v5.4s
+ orr v23.16b, v23.16b, v27.16b
+ ext v28.16b, v4.16b, v4.16b, #12
+ ext v4.16b, v25.16b, v5.16b, #8
+ ext v25.16b, v26.16b, v26.16b, #8
+ add v26.4s, v28.4s, v23.4s
+ eor v25.16b, v26.16b, v25.16b
+ ext v22.16b, v22.16b, v22.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v22.4s, v22.4s, v25.4s
+ eor v23.16b, v23.16b, v22.16b
+ add v17.4s, v26.4s, v17.4s
+ ushr v26.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v26.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v25.16b, v25.16b, v17.16b
+ add v17.4s, v17.4s, v19.4s
+ tbl v19.16b, { v25.16b }, v1.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ ext v17.16b, v17.16b, v17.16b, #4
+ orr v23.16b, v23.16b, v25.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v17.4s, v17.4s, v23.4s
+ eor v19.16b, v17.16b, v19.16b
+ ext v22.16b, v22.16b, v22.16b, #12
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v17.4s, v17.4s, v16.4s
+ orr v23.16b, v23.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ ext v25.16b, v17.16b, v17.16b, #12
+ eor v17.16b, v19.16b, v17.16b
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v19.4s, v22.4s, v17.4s
+ eor v22.16b, v23.16b, v19.16b
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v6.2d, v16.2d
+ ushr v23.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ zip2 v24.4s, v16.4s, v6.4s
+ tbl v26.16b, { v20.16b, v21.16b }, v2.16b
+ orr v22.16b, v22.16b, v23.16b
+ zip1 v16.4s, v24.4s, v21.4s
+ zip1 v20.4s, v21.4s, v24.4s
+ ext v21.16b, v26.16b, v26.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v25.4s, v25.4s, v22.4s
+ ext v16.16b, v20.16b, v16.16b, #8
+ uzp1 v21.4s, v26.4s, v21.4s
+ eor v26.16b, v25.16b, v17.16b
+ ext v19.16b, v19.16b, v19.16b, #4
+ tbl v26.16b, { v26.16b }, v0.16b
+ mov v29.16b, v16.16b
+ add v19.4s, v19.4s, v26.4s
+ ext v27.16b, v5.16b, v5.16b, #12
+ mov v29.s[1], v21.s[2]
+ eor v22.16b, v22.16b, v19.16b
+ ext v28.16b, v5.16b, v27.16b, #12
+ ushr v27.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v25.4s, v6.4s
+ orr v22.16b, v22.16b, v27.16b
+ add v6.4s, v6.4s, v22.4s
+ eor v26.16b, v26.16b, v6.16b
+ add v6.4s, v6.4s, v18.4s
+ tbl v18.16b, { v26.16b }, v1.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v22.16b, v22.16b, v26.16b
+ ext v18.16b, v18.16b, v18.16b, #8
+ add v6.4s, v6.4s, v22.4s
+ eor v18.16b, v6.16b, v18.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ tbl v18.16b, { v18.16b }, v0.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v6.4s, v7.4s
+ orr v22.16b, v22.16b, v26.16b
+ add v6.4s, v6.4s, v22.4s
+ ext v26.16b, v6.16b, v6.16b, #12
+ eor v6.16b, v18.16b, v6.16b
+ uzp2 v4.4s, v4.4s, v7.4s
+ zip2 v25.4s, v7.4s, v16.4s
+ add v26.4s, v26.4s, v21.4s
+ zip1 v20.2d, v16.2d, v7.2d
+ tbl v6.16b, { v6.16b }, v1.16b
+ ext v24.16b, v4.16b, v4.16b, #4
+ tbl v27.16b, { v20.16b, v21.16b }, v2.16b
+ zip1 v7.4s, v25.4s, v21.4s
+ zip1 v20.4s, v21.4s, v25.4s
+ add v18.4s, v19.4s, v6.4s
+ uzp1 v5.4s, v24.4s, v24.4s
+ ext v21.16b, v27.16b, v27.16b, #12
+ ext v7.16b, v20.16b, v7.16b, #8
+ eor v19.16b, v22.16b, v18.16b
+ ext v5.16b, v5.16b, v24.16b, #8
+ tbl v17.16b, { v28.16b, v29.16b }, v3.16b
+ uzp1 v21.4s, v27.4s, v21.4s
+ mov v28.16b, v7.16b
+ ushr v22.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v23.16b, v24.16b, v24.16b, #12
+ uzp2 v5.4s, v5.4s, v17.4s
+ mov v28.s[1], v21.s[2]
+ orr v19.16b, v19.16b, v22.16b
+ ext v27.16b, v24.16b, v23.16b, #12
+ ext v23.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v25.16b, v18.16b, v18.16b, #4
+ add v18.4s, v26.4s, v19.4s
+ uzp1 v24.4s, v23.4s, v23.4s
+ eor v6.16b, v18.16b, v6.16b
+ ext v24.16b, v24.16b, v23.16b, #8
+ add v16.4s, v18.4s, v16.4s
+ tbl v18.16b, { v27.16b, v28.16b }, v3.16b
+ tbl v27.16b, { v6.16b }, v0.16b
+ uzp2 v6.4s, v24.4s, v18.4s
+ add v24.4s, v25.4s, v27.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v25.16b
+ add v16.4s, v16.4s, v19.4s
+ eor v25.16b, v27.16b, v16.16b
+ add v4.4s, v16.4s, v4.4s
+ tbl v16.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v16.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v16.16b, v16.16b, v16.16b, #8
+ add v4.4s, v4.4s, v19.4s
+ eor v16.16b, v4.16b, v16.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v25.16b, { v16.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v16.16b, v19.16b, v24.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v17.4s
+ orr v19.16b, v16.16b, v19.16b
+ add v27.4s, v4.4s, v19.4s
+ eor v25.16b, v25.16b, v27.16b
+ tbl v25.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v25.4s
+ zip2 v26.4s, v17.4s, v7.4s
+ ext v4.16b, v27.16b, v27.16b, #12
+ eor v19.16b, v19.16b, v24.16b
+ add v28.4s, v4.4s, v21.4s
+ zip1 v20.2d, v7.2d, v17.2d
+ zip1 v4.4s, v26.4s, v21.4s
+ zip1 v17.4s, v21.4s, v26.4s
+ ushr v26.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v26.16b
+ ext v25.16b, v25.16b, v25.16b, #8
+ add v27.4s, v28.4s, v19.4s
+ eor v25.16b, v27.16b, v25.16b
+ ext v24.16b, v24.16b, v24.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v19.16b, v19.16b, v24.16b
+ add v7.4s, v27.4s, v7.4s
+ ushr v27.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v27.16b
+ add v7.4s, v7.4s, v19.4s
+ eor v25.16b, v25.16b, v7.16b
+ add v5.4s, v7.4s, v5.4s
+ tbl v7.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v7.16b, v7.16b, v7.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ eor v7.16b, v5.16b, v7.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ tbl v16.16b, { v20.16b, v21.16b }, v2.16b
+ add v5.4s, v5.4s, v18.4s
+ orr v19.16b, v19.16b, v25.16b
+ ext v20.16b, v16.16b, v16.16b, #12
+ ext v4.16b, v17.16b, v4.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ uzp1 v21.4s, v16.4s, v20.4s
+ mov v17.16b, v4.16b
+ ext v25.16b, v5.16b, v5.16b, #12
+ mov v17.s[1], v21.s[2]
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v4.2d, v18.2d
+ ext v22.16b, v23.16b, v23.16b, #12
+ zip2 v26.4s, v18.4s, v4.4s
+ tbl v18.16b, { v20.16b, v21.16b }, v2.16b
+ eor v5.16b, v7.16b, v5.16b
+ ext v16.16b, v23.16b, v22.16b, #12
+ ext v22.16b, v6.16b, v6.16b, #4
+ zip1 v27.4s, v26.4s, v21.4s
+ zip1 v20.4s, v21.4s, v26.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ tbl v5.16b, { v5.16b }, v1.16b
+ ext v20.16b, v20.16b, v27.16b, #8
+ uzp1 v27.4s, v18.4s, v21.4s
+ uzp1 v18.4s, v22.4s, v22.4s
+ add v21.4s, v24.4s, v5.4s
+ ext v18.16b, v18.16b, v22.16b, #8
+ eor v19.16b, v19.16b, v21.16b
+ tbl v7.16b, { v16.16b, v17.16b }, v3.16b
+ uzp2 v18.4s, v18.4s, v17.4s
+ zip2 v16.4s, v16.4s, v20.4s
+ ushr v17.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v17.16b, v19.16b, v17.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v19.4s, v25.4s, v17.4s
+ eor v5.16b, v19.16b, v5.16b
+ ext v21.16b, v21.16b, v21.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v4.4s, v19.4s, v4.4s
+ add v19.4s, v21.4s, v5.4s
+ eor v17.16b, v17.16b, v19.16b
+ ushr v21.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ orr v17.16b, v17.16b, v21.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v4.4s, v4.4s, v6.4s
+ add v6.4s, v19.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v17.16b, v17.16b, v19.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v4.4s, v4.4s, v7.4s
+ orr v17.16b, v17.16b, v19.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ mov v29.16b, v20.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v6.4s, v6.4s, v5.4s
+ mov v29.s[1], v27.s[2]
+ add v4.4s, v4.4s, v27.4s
+ zip1 v26.2d, v20.2d, v7.2d
+ zip1 v7.4s, v16.4s, v27.4s
+ zip1 v16.4s, v27.4s, v16.4s
+ eor v17.16b, v17.16b, v6.16b
+ ext v7.16b, v16.16b, v7.16b, #8
+ ushr v16.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ orr v16.16b, v17.16b, v16.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ ushr v17.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v20.4s
+ orr v16.16b, v16.16b, v17.16b
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v17.4s, v16.4s, #7
+ shl v16.4s, v16.4s, #25
+ ext v23.16b, v22.16b, v22.16b, #12
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v16.16b, v16.16b, v17.16b
+ ext v28.16b, v22.16b, v23.16b, #12
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v16.4s, v4.4s
+ tbl v3.16b, { v28.16b, v29.16b }, v3.16b
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v4.4s, v3.4s
+ tbl v4.16b, { v5.16b }, v0.16b
+ add v5.4s, v6.4s, v4.4s
+ eor v6.16b, v16.16b, v5.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ orr v6.16b, v6.16b, v16.16b
+ tbl v2.16b, { v26.16b, v27.16b }, v2.16b
+ add v3.4s, v3.4s, v6.4s
+ ext v19.16b, v2.16b, v2.16b, #12
+ eor v4.16b, v4.16b, v3.16b
+ uzp1 v2.4s, v2.4s, v19.4s
+ ext v3.16b, v3.16b, v3.16b, #12
+ tbl v4.16b, { v4.16b }, v1.16b
+ add v2.4s, v3.4s, v2.4s
+ add v3.4s, v5.4s, v4.4s
+ eor v5.16b, v6.16b, v3.16b
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v6.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ add v2.4s, v2.4s, v5.4s
+ eor v4.16b, v2.16b, v4.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ tbl v0.16b, { v4.16b }, v0.16b
+ add v3.4s, v3.4s, v0.4s
+ eor v4.16b, v5.16b, v3.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v2.4s, v2.4s, v7.4s
+ orr v4.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v4.4s
+ eor v0.16b, v0.16b, v2.16b
+ tbl v0.16b, { v0.16b }, v1.16b
+ add v1.4s, v3.4s, v0.4s
+ eor v3.16b, v4.16b, v1.16b
+ ext v2.16b, v2.16b, v2.16b, #4
+ ext v1.16b, v1.16b, v1.16b, #12
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v0.16b, v0.16b, v0.16b, #8
+ eor v1.16b, v2.16b, v1.16b
+ orr v2.16b, v3.16b, v4.16b
+ eor v0.16b, v2.16b, v0.16b
+ stp q1, q0, [x0]
+ ret
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI1_0:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI1_1:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI1_2:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI1_3:
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 20
+ .byte 21
+ .byte 22
+ .byte 23
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+.LCPI1_4:
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 4
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+ .text
+ .globl zfs_blake3_compress_xof_sse41
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41:
+ .cfi_startproc
+ ldp q7, q6, [x0]
+ ldp q17, q18, [x1]
+ add x12, x1, #32
+ ld2 { v4.4s, v5.4s }, [x12]
+ lsr x10, x3, #32
+ fmov s16, w3
+ adrp x13, .LCPI1_0
+ adrp x11, .LCPI1_1
+ and w8, w2, #0xff
+ mov v16.s[1], w10
+ ldr q0, [x13, :lo12:.LCPI1_0]
+ ldr q20, [x11, :lo12:.LCPI1_1]
+ adrp x11, .LCPI1_4
+ and w9, w4, #0xff
+ ldr q2, [x11, :lo12:.LCPI1_4]
+ mov v16.s[2], w8
+ uzp1 v21.4s, v17.4s, v18.4s
+ add v7.4s, v6.4s, v7.4s
+ adrp x12, .LCPI1_3
+ mov v16.s[3], w9
+ uzp2 v18.4s, v17.4s, v18.4s
+ add v7.4s, v7.4s, v21.4s
+ ext v17.16b, v5.16b, v5.16b, #12
+ ldr q3, [x12, :lo12:.LCPI1_3]
+ ext v24.16b, v4.16b, v4.16b, #12
+ eor v16.16b, v7.16b, v16.16b
+ mov v27.16b, v17.16b
+ uzp1 v19.4s, v21.4s, v21.4s
+ ext v25.16b, v21.16b, v21.16b, #12
+ zip2 v28.4s, v18.4s, v17.4s
+ tbl v29.16b, { v16.16b }, v0.16b
+ mov v27.s[1], v24.s[2]
+ zip1 v23.2d, v17.2d, v18.2d
+ ext v19.16b, v19.16b, v21.16b, #8
+ add v22.4s, v29.4s, v20.4s
+ ext v26.16b, v21.16b, v25.16b, #12
+ tbl v20.16b, { v23.16b, v24.16b }, v2.16b
+ zip1 v21.4s, v28.4s, v24.4s
+ zip1 v23.4s, v24.4s, v28.4s
+ uzp2 v19.4s, v19.4s, v18.4s
+ eor v24.16b, v22.16b, v6.16b
+ ext v25.16b, v20.16b, v20.16b, #12
+ ext v6.16b, v23.16b, v21.16b, #8
+ add v7.4s, v7.4s, v18.4s
+ ext v18.16b, v19.16b, v19.16b, #4
+ tbl v16.16b, { v26.16b, v27.16b }, v3.16b
+ uzp1 v21.4s, v20.4s, v25.4s
+ mov v26.16b, v6.16b
+ ext v23.16b, v18.16b, v18.16b, #12
+ mov v26.s[1], v21.s[2]
+ adrp x10, .LCPI1_2
+ ext v25.16b, v18.16b, v23.16b, #12
+ uzp1 v23.4s, v18.4s, v18.4s
+ ldr q1, [x10, :lo12:.LCPI1_2]
+ ext v18.16b, v23.16b, v18.16b, #8
+ ushr v23.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ orr v23.16b, v24.16b, v23.16b
+ add v7.4s, v7.4s, v23.4s
+ eor v27.16b, v29.16b, v7.16b
+ add v4.4s, v7.4s, v4.4s
+ tbl v7.16b, { v25.16b, v26.16b }, v3.16b
+ tbl v26.16b, { v27.16b }, v1.16b
+ add v22.4s, v22.4s, v26.4s
+ uzp2 v18.4s, v18.4s, v16.4s
+ eor v23.16b, v23.16b, v22.16b
+ ext v5.16b, v18.16b, v18.16b, #4
+ ushr v27.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ uzp1 v25.4s, v5.4s, v5.4s
+ orr v23.16b, v23.16b, v27.16b
+ ext v28.16b, v4.16b, v4.16b, #12
+ ext v4.16b, v25.16b, v5.16b, #8
+ ext v25.16b, v26.16b, v26.16b, #8
+ add v26.4s, v28.4s, v23.4s
+ eor v25.16b, v26.16b, v25.16b
+ ext v22.16b, v22.16b, v22.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v22.4s, v22.4s, v25.4s
+ eor v23.16b, v23.16b, v22.16b
+ add v17.4s, v26.4s, v17.4s
+ ushr v26.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v26.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v25.16b, v25.16b, v17.16b
+ add v17.4s, v17.4s, v19.4s
+ tbl v19.16b, { v25.16b }, v1.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ ext v17.16b, v17.16b, v17.16b, #4
+ orr v23.16b, v23.16b, v25.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v17.4s, v17.4s, v23.4s
+ eor v19.16b, v17.16b, v19.16b
+ ext v22.16b, v22.16b, v22.16b, #12
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v17.4s, v17.4s, v16.4s
+ orr v23.16b, v23.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ ext v25.16b, v17.16b, v17.16b, #12
+ eor v17.16b, v19.16b, v17.16b
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v19.4s, v22.4s, v17.4s
+ eor v22.16b, v23.16b, v19.16b
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v6.2d, v16.2d
+ ushr v23.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ zip2 v24.4s, v16.4s, v6.4s
+ tbl v26.16b, { v20.16b, v21.16b }, v2.16b
+ orr v22.16b, v22.16b, v23.16b
+ zip1 v16.4s, v24.4s, v21.4s
+ zip1 v20.4s, v21.4s, v24.4s
+ ext v21.16b, v26.16b, v26.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v25.4s, v25.4s, v22.4s
+ ext v16.16b, v20.16b, v16.16b, #8
+ uzp1 v21.4s, v26.4s, v21.4s
+ eor v26.16b, v25.16b, v17.16b
+ ext v19.16b, v19.16b, v19.16b, #4
+ tbl v26.16b, { v26.16b }, v0.16b
+ mov v29.16b, v16.16b
+ add v19.4s, v19.4s, v26.4s
+ ext v27.16b, v5.16b, v5.16b, #12
+ mov v29.s[1], v21.s[2]
+ eor v22.16b, v22.16b, v19.16b
+ ext v28.16b, v5.16b, v27.16b, #12
+ ushr v27.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v25.4s, v6.4s
+ orr v22.16b, v22.16b, v27.16b
+ add v6.4s, v6.4s, v22.4s
+ eor v26.16b, v26.16b, v6.16b
+ add v6.4s, v6.4s, v18.4s
+ tbl v18.16b, { v26.16b }, v1.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v22.16b, v22.16b, v26.16b
+ ext v18.16b, v18.16b, v18.16b, #8
+ add v6.4s, v6.4s, v22.4s
+ eor v18.16b, v6.16b, v18.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ tbl v18.16b, { v18.16b }, v0.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v6.4s, v7.4s
+ orr v22.16b, v22.16b, v26.16b
+ add v6.4s, v6.4s, v22.4s
+ ext v26.16b, v6.16b, v6.16b, #12
+ eor v6.16b, v18.16b, v6.16b
+ uzp2 v4.4s, v4.4s, v7.4s
+ zip2 v25.4s, v7.4s, v16.4s
+ add v26.4s, v26.4s, v21.4s
+ zip1 v20.2d, v16.2d, v7.2d
+ tbl v6.16b, { v6.16b }, v1.16b
+ ext v24.16b, v4.16b, v4.16b, #4
+ tbl v27.16b, { v20.16b, v21.16b }, v2.16b
+ zip1 v7.4s, v25.4s, v21.4s
+ zip1 v20.4s, v21.4s, v25.4s
+ add v18.4s, v19.4s, v6.4s
+ uzp1 v5.4s, v24.4s, v24.4s
+ ext v21.16b, v27.16b, v27.16b, #12
+ ext v7.16b, v20.16b, v7.16b, #8
+ eor v19.16b, v22.16b, v18.16b
+ ext v5.16b, v5.16b, v24.16b, #8
+ tbl v17.16b, { v28.16b, v29.16b }, v3.16b
+ uzp1 v21.4s, v27.4s, v21.4s
+ mov v28.16b, v7.16b
+ ushr v22.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v23.16b, v24.16b, v24.16b, #12
+ uzp2 v5.4s, v5.4s, v17.4s
+ mov v28.s[1], v21.s[2]
+ orr v19.16b, v19.16b, v22.16b
+ ext v27.16b, v24.16b, v23.16b, #12
+ ext v23.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v25.16b, v18.16b, v18.16b, #4
+ add v18.4s, v26.4s, v19.4s
+ uzp1 v24.4s, v23.4s, v23.4s
+ eor v6.16b, v18.16b, v6.16b
+ ext v24.16b, v24.16b, v23.16b, #8
+ add v16.4s, v18.4s, v16.4s
+ tbl v18.16b, { v27.16b, v28.16b }, v3.16b
+ tbl v27.16b, { v6.16b }, v0.16b
+ uzp2 v6.4s, v24.4s, v18.4s
+ add v24.4s, v25.4s, v27.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v25.16b
+ add v16.4s, v16.4s, v19.4s
+ eor v25.16b, v27.16b, v16.16b
+ add v4.4s, v16.4s, v4.4s
+ tbl v16.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v16.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v16.16b, v16.16b, v16.16b, #8
+ add v4.4s, v4.4s, v19.4s
+ eor v16.16b, v4.16b, v16.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v25.16b, { v16.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v16.16b, v19.16b, v24.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v17.4s
+ orr v19.16b, v16.16b, v19.16b
+ add v27.4s, v4.4s, v19.4s
+ eor v25.16b, v25.16b, v27.16b
+ tbl v25.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v25.4s
+ zip2 v26.4s, v17.4s, v7.4s
+ ext v4.16b, v27.16b, v27.16b, #12
+ eor v19.16b, v19.16b, v24.16b
+ add v28.4s, v4.4s, v21.4s
+ zip1 v20.2d, v7.2d, v17.2d
+ zip1 v4.4s, v26.4s, v21.4s
+ zip1 v17.4s, v21.4s, v26.4s
+ ushr v26.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v26.16b
+ ext v25.16b, v25.16b, v25.16b, #8
+ add v27.4s, v28.4s, v19.4s
+ eor v25.16b, v27.16b, v25.16b
+ ext v24.16b, v24.16b, v24.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v19.16b, v19.16b, v24.16b
+ add v7.4s, v27.4s, v7.4s
+ ushr v27.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v27.16b
+ add v7.4s, v7.4s, v19.4s
+ eor v25.16b, v25.16b, v7.16b
+ add v5.4s, v7.4s, v5.4s
+ tbl v7.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v7.16b, v7.16b, v7.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ eor v7.16b, v5.16b, v7.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ tbl v16.16b, { v20.16b, v21.16b }, v2.16b
+ add v5.4s, v5.4s, v18.4s
+ orr v19.16b, v19.16b, v25.16b
+ ext v20.16b, v16.16b, v16.16b, #12
+ ext v4.16b, v17.16b, v4.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ uzp1 v21.4s, v16.4s, v20.4s
+ mov v17.16b, v4.16b
+ ext v25.16b, v5.16b, v5.16b, #12
+ mov v17.s[1], v21.s[2]
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v4.2d, v18.2d
+ ext v22.16b, v23.16b, v23.16b, #12
+ zip2 v26.4s, v18.4s, v4.4s
+ tbl v18.16b, { v20.16b, v21.16b }, v2.16b
+ eor v5.16b, v7.16b, v5.16b
+ ext v16.16b, v23.16b, v22.16b, #12
+ ext v22.16b, v6.16b, v6.16b, #4
+ zip1 v27.4s, v26.4s, v21.4s
+ zip1 v20.4s, v21.4s, v26.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ tbl v5.16b, { v5.16b }, v1.16b
+ ext v20.16b, v20.16b, v27.16b, #8
+ uzp1 v27.4s, v18.4s, v21.4s
+ uzp1 v18.4s, v22.4s, v22.4s
+ add v21.4s, v24.4s, v5.4s
+ ext v18.16b, v18.16b, v22.16b, #8
+ eor v19.16b, v19.16b, v21.16b
+ tbl v7.16b, { v16.16b, v17.16b }, v3.16b
+ uzp2 v18.4s, v18.4s, v17.4s
+ zip2 v16.4s, v16.4s, v20.4s
+ ushr v17.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v17.16b, v19.16b, v17.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v19.4s, v25.4s, v17.4s
+ eor v5.16b, v19.16b, v5.16b
+ ext v21.16b, v21.16b, v21.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v4.4s, v19.4s, v4.4s
+ add v19.4s, v21.4s, v5.4s
+ eor v17.16b, v17.16b, v19.16b
+ ushr v21.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ orr v17.16b, v17.16b, v21.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v4.4s, v4.4s, v6.4s
+ add v6.4s, v19.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v17.16b, v17.16b, v19.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v4.4s, v4.4s, v7.4s
+ orr v17.16b, v17.16b, v19.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ mov v29.16b, v20.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v6.4s, v6.4s, v5.4s
+ mov v29.s[1], v27.s[2]
+ add v4.4s, v4.4s, v27.4s
+ zip1 v26.2d, v20.2d, v7.2d
+ zip1 v7.4s, v16.4s, v27.4s
+ zip1 v16.4s, v27.4s, v16.4s
+ eor v17.16b, v17.16b, v6.16b
+ ext v7.16b, v16.16b, v7.16b, #8
+ ushr v16.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ orr v16.16b, v17.16b, v16.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ ushr v17.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v20.4s
+ orr v16.16b, v16.16b, v17.16b
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v17.4s, v16.4s, #7
+ shl v16.4s, v16.4s, #25
+ ext v23.16b, v22.16b, v22.16b, #12
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v16.16b, v16.16b, v17.16b
+ ext v28.16b, v22.16b, v23.16b, #12
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v16.4s, v4.4s
+ tbl v3.16b, { v28.16b, v29.16b }, v3.16b
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v4.4s, v3.4s
+ tbl v4.16b, { v5.16b }, v0.16b
+ add v5.4s, v6.4s, v4.4s
+ eor v6.16b, v16.16b, v5.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ orr v6.16b, v6.16b, v16.16b
+ tbl v2.16b, { v26.16b, v27.16b }, v2.16b
+ add v3.4s, v3.4s, v6.4s
+ ext v19.16b, v2.16b, v2.16b, #12
+ eor v4.16b, v4.16b, v3.16b
+ uzp1 v2.4s, v2.4s, v19.4s
+ ext v3.16b, v3.16b, v3.16b, #12
+ tbl v4.16b, { v4.16b }, v1.16b
+ add v2.4s, v3.4s, v2.4s
+ add v3.4s, v5.4s, v4.4s
+ eor v5.16b, v6.16b, v3.16b
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v6.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ add v2.4s, v2.4s, v5.4s
+ eor v4.16b, v2.16b, v4.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ tbl v0.16b, { v4.16b }, v0.16b
+ add v3.4s, v3.4s, v0.4s
+ eor v4.16b, v5.16b, v3.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v2.4s, v2.4s, v7.4s
+ orr v4.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v4.4s
+ eor v0.16b, v0.16b, v2.16b
+ tbl v0.16b, { v0.16b }, v1.16b
+ add v1.4s, v3.4s, v0.4s
+ eor v3.16b, v4.16b, v1.16b
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ ext v0.16b, v0.16b, v0.16b, #8
+ ext v1.16b, v1.16b, v1.16b, #12
+ orr v3.16b, v3.16b, v4.16b
+ eor v2.16b, v2.16b, v1.16b
+ eor v3.16b, v3.16b, v0.16b
+ stp q2, q3, [x5]
+ ldr q2, [x0]
+ eor v1.16b, v2.16b, v1.16b
+ str q1, [x5, #32]
+ ldr q1, [x0, #16]
+ eor v0.16b, v1.16b, v0.16b
+ str q0, [x5, #48]
+ ret
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI2_0:
+ .word 0
+ .word 1
+ .word 2
+ .word 3
+.LCPI2_1:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI2_2:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+ .text
+ .globl zfs_blake3_hash_many_sse41
+ .p2align 2
+ .type zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41:
+ .cfi_startproc
+ stp d15, d14, [sp, #-160]!
+ stp d13, d12, [sp, #16]
+ stp d11, d10, [sp, #32]
+ stp d9, d8, [sp, #48]
+ stp x29, x30, [sp, #64]
+ stp x28, x27, [sp, #80]
+ stp x26, x25, [sp, #96]
+ stp x24, x23, [sp, #112]
+ stp x22, x21, [sp, #128]
+ stp x20, x19, [sp, #144]
+ mov x29, sp
+ sub sp, sp, #448
+ .cfi_def_cfa w29, 160
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w21, -24
+ .cfi_offset w22, -32
+ .cfi_offset w23, -40
+ .cfi_offset w24, -48
+ .cfi_offset w25, -56
+ .cfi_offset w26, -64
+ .cfi_offset w27, -72
+ .cfi_offset w28, -80
+ .cfi_offset w30, -88
+ .cfi_offset w29, -96
+ .cfi_offset b8, -104
+ .cfi_offset b9, -112
+ .cfi_offset b10, -120
+ .cfi_offset b11, -128
+ .cfi_offset b12, -136
+ .cfi_offset b13, -144
+ .cfi_offset b14, -152
+ .cfi_offset b15, -160
+ ldr x26, [x29, #168]
+ ldrb w27, [x29, #160]
+ mov w19, w6
+ mov x20, x4
+ mov x22, x2
+ mov x28, x1
+ cmp x1, #4
+ mov x24, x0
+ str x3, [sp, #40]
+ b.lo .LBB2_8
+ adrp x11, .LCPI2_0
+ ldr q0, [x11, :lo12:.LCPI2_0]
+ sbfx w13, w5, #0, #1
+ dup v1.4s, w13
+ mov w10, #58983
+ mov w11, #44677
+ mov w12, #62322
+ and v0.16b, v1.16b, v0.16b
+ mov w13, #62778
+ orr w8, w7, w19
+ adrp x9, .LCPI2_1
+ movk w10, #27145, lsl #16
+ movk w11, #47975, lsl #16
+ movk w12, #15470, lsl #16
+ movk w13, #42319, lsl #16
+ str q0, [sp, #16]
+ orr v0.4s, #128, lsl #24
+ adrp x14, .LCPI2_2
+ str q0, [sp]
+.LBB2_2:
+ ldr x2, [sp, #40]
+ mov x15, x2
+ ld1r { v7.4s }, [x15], #4
+ add x16, x2, #8
+ add x17, x2, #12
+ add x18, x2, #16
+ add x0, x2, #20
+ add x3, x2, #24
+ add x2, x2, #28
+ ld1r { v6.4s }, [x16]
+ ld1r { v17.4s }, [x17]
+ ld1r { v10.4s }, [x18]
+ ld1r { v11.4s }, [x0]
+ ld1r { v19.4s }, [x3]
+ ld1r { v18.4s }, [x15]
+ ld1r { v16.4s }, [x2]
+ cbz x22, .LBB2_7
+ ldr q1, [sp, #16]
+ dup v0.4s, w20
+ ldp x15, x16, [x24]
+ ldp x17, x18, [x24, #16]
+ add v1.4s, v0.4s, v1.4s
+ movi v0.4s, #128, lsl #24
+ str q1, [sp, #64]
+ eor v0.16b, v1.16b, v0.16b
+ ldr q1, [sp]
+ lsr x2, x20, #32
+ mov x0, xzr
+ mov w6, w8
+ cmgt v0.4s, v1.4s, v0.4s
+ dup v1.4s, w2
+ sub v0.4s, v1.4s, v0.4s
+ str q0, [sp, #48]
+.LBB2_4:
+ mov w4, #16
+ stp q16, q17, [sp, #192]
+ bfi x4, x0, #6, #58
+ ldr q1, [x15, x4]
+ ldr q3, [x16, x4]
+ ldr q2, [x17, x4]
+ ldr q4, [x18, x4]
+ mov w4, #32
+ bfi x4, x0, #6, #58
+ ldr q5, [x15, x4]
+ ldr q20, [x16, x4]
+ ldr q21, [x17, x4]
+ ldr q22, [x18, x4]
+ mov w4, #48
+ lsl x3, x0, #6
+ bfi x4, x0, #6, #58
+ add x0, x0, #1
+ ldr q0, [x15, x3]
+ ldr q23, [x16, x3]
+ ldr q16, [x17, x3]
+ ldr q17, [x18, x3]
+ cmp x0, x22
+ ldr q25, [x15, x4]
+ ldr q14, [x16, x4]
+ ldr q28, [x17, x4]
+ ldr q31, [x18, x4]
+ csel w4, w27, wzr, eq
+ orr w4, w4, w6
+ mov x2, xzr
+ and w6, w4, #0xff
+ add x3, x3, #256
+.LBB2_5:
+ ldr x4, [x24, x2]
+ add x2, x2, #8
+ cmp x2, #32
+ add x4, x4, x3
+ prfm pldl1keep, [x4]
+ b.ne .LBB2_5
+ zip1 v29.4s, v0.4s, v23.4s
+ zip2 v23.4s, v0.4s, v23.4s
+ zip1 v0.4s, v16.4s, v17.4s
+ zip2 v24.4s, v16.4s, v17.4s
+ zip1 v9.4s, v1.4s, v3.4s
+ zip2 v26.4s, v1.4s, v3.4s
+ zip1 v27.4s, v2.4s, v4.4s
+ zip2 v17.4s, v2.4s, v4.4s
+ zip1 v12.4s, v21.4s, v22.4s
+ zip2 v13.4s, v21.4s, v22.4s
+ add v2.4s, v7.4s, v10.4s
+ add v1.4s, v18.4s, v11.4s
+ ext v7.16b, v0.16b, v29.16b, #8
+ ext v22.16b, v24.16b, v23.16b, #8
+ zip1 v30.4s, v5.4s, v20.4s
+ zip2 v20.4s, v5.4s, v20.4s
+ stp q1, q2, [sp, #112]
+ ext v2.16b, v29.16b, v7.16b, #8
+ mov v29.d[1], v0.d[0]
+ ext v18.16b, v23.16b, v22.16b, #8
+ mov v23.d[1], v24.d[0]
+ zip1 v21.4s, v25.4s, v14.4s
+ zip2 v4.4s, v25.4s, v14.4s
+ zip1 v14.4s, v28.4s, v31.4s
+ zip2 v15.4s, v28.4s, v31.4s
+ add v8.4s, v6.4s, v19.4s
+ ext v28.16b, v27.16b, v9.16b, #8
+ ext v31.16b, v17.16b, v26.16b, #8
+ stur q2, [x29, #-208]
+ mov v7.16b, v29.16b
+ ext v0.16b, v12.16b, v30.16b, #8
+ stp q23, q29, [x29, #-80]
+ mov v2.16b, v19.16b
+ ext v19.16b, v13.16b, v20.16b, #8
+ mov v29.16b, v9.16b
+ ext v25.16b, v9.16b, v28.16b, #8
+ mov v29.d[1], v27.d[0]
+ ext v24.16b, v26.16b, v31.16b, #8
+ mov v26.d[1], v17.d[0]
+ ext v17.16b, v15.16b, v4.16b, #8
+ ext v27.16b, v30.16b, v0.16b, #8
+ ext v0.16b, v20.16b, v19.16b, #8
+ stp q0, q25, [sp, #80]
+ ext v0.16b, v4.16b, v17.16b, #8
+ str q0, [sp, #224]
+ ldr q0, [sp, #128]
+ mov v6.16b, v23.16b
+ mov v22.16b, v4.16b
+ ldr q16, [x9, :lo12:.LCPI2_1]
+ add v17.4s, v0.4s, v7.4s
+ ldr q0, [sp, #112]
+ mov v30.d[1], v12.d[0]
+ add v7.4s, v8.4s, v29.4s
+ mov v20.d[1], v13.d[0]
+ add v4.4s, v0.4s, v6.4s
+ ldr q0, [sp, #64]
+ dup v3.4s, w12
+ ext v28.16b, v14.16b, v21.16b, #8
+ dup v1.4s, w10
+ eor v19.16b, v17.16b, v0.16b
+ ldr q0, [sp, #48]
+ ext v23.16b, v21.16b, v28.16b, #8
+ mov v21.d[1], v14.d[0]
+ tbl v14.16b, { v19.16b }, v16.16b
+ eor v12.16b, v4.16b, v0.16b
+ movi v0.4s, #64
+ eor v13.16b, v7.16b, v0.16b
+ tbl v13.16b, { v13.16b }, v16.16b
+ add v6.4s, v13.4s, v3.4s
+ dup v5.4s, w11
+ tbl v12.16b, { v12.16b }, v16.16b
+ add v1.4s, v14.4s, v1.4s
+ eor v9.16b, v6.16b, v2.16b
+ ldp q2, q0, [sp, #192]
+ add v5.4s, v12.4s, v5.4s
+ eor v19.16b, v1.16b, v10.16b
+ eor v10.16b, v5.16b, v11.16b
+ ushr v11.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v11.16b, v19.16b, v11.16b
+ ushr v19.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ mov v22.d[1], v15.d[0]
+ orr v10.16b, v10.16b, v19.16b
+ ushr v19.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ add v15.4s, v0.4s, v2.4s
+ orr v9.16b, v9.16b, v19.16b
+ dup v19.4s, w6
+ add v15.4s, v15.4s, v26.4s
+ eor v19.16b, v15.16b, v19.16b
+ tbl v3.16b, { v19.16b }, v16.16b
+ dup v19.4s, w13
+ add v8.4s, v3.4s, v19.4s
+ ldur q31, [x29, #-208]
+ eor v19.16b, v8.16b, v2.16b
+ ushr v0.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v2.16b, v19.16b, v0.16b
+ ldr q19, [x14, :lo12:.LCPI2_2]
+ add v17.4s, v17.4s, v31.4s
+ add v17.4s, v17.4s, v11.4s
+ eor v14.16b, v14.16b, v17.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ add v1.4s, v1.4s, v14.4s
+ eor v11.16b, v1.16b, v11.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v0.4s, v11.4s, #7
+ shl v11.4s, v11.4s, #25
+ add v4.4s, v4.4s, v10.4s
+ orr v0.16b, v11.16b, v0.16b
+ eor v11.16b, v12.16b, v4.16b
+ tbl v11.16b, { v11.16b }, v19.16b
+ add v5.4s, v5.4s, v11.4s
+ eor v10.16b, v5.16b, v10.16b
+ add v7.4s, v7.4s, v25.4s
+ ushr v12.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ add v7.4s, v7.4s, v9.4s
+ orr v10.16b, v10.16b, v12.16b
+ eor v12.16b, v13.16b, v7.16b
+ tbl v12.16b, { v12.16b }, v19.16b
+ add v6.4s, v6.4s, v12.4s
+ eor v9.16b, v6.16b, v9.16b
+ ushr v13.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ orr v9.16b, v9.16b, v13.16b
+ add v13.4s, v15.4s, v24.4s
+ add v13.4s, v13.4s, v2.4s
+ eor v3.16b, v3.16b, v13.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v8.4s, v8.4s, v3.4s
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v30.4s
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v21.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v22.4s
+ mov v28.16b, v26.16b
+ stur q26, [x29, #-112]
+ mov v26.16b, v18.16b
+ mov v18.16b, v24.16b
+ stur q24, [x29, #-160]
+ add v6.4s, v6.4s, v3.4s
+ mov v24.16b, v20.16b
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q20, [sp, #80]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ stp q30, q22, [x29, #-192]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ mov v30.16b, v27.16b
+ add v17.4s, v17.4s, v27.4s
+ ldr q27, [sp, #224]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v23.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v27.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ stur q21, [x29, #-144]
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ ldur q21, [x29, #-80]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v21.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v26.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v18.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v29.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-64]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v28.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v23.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-144]
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v31.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v27.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q27, [sp, #96]
+ mov v21.16b, v26.16b
+ stur q26, [x29, #-96]
+ mov v28.16b, v31.16b
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldp q31, q26, [x29, #-192]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v20.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v27.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v26.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v31.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ mov v18.16b, v24.16b
+ mov v24.16b, v20.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ldur q20, [x29, #-160]
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v21.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v18.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v23.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v20.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q25, [x29, #-80]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v29.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v25.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v26.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ ldur q25, [x29, #-112]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v30.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v24.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v31.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q25, [x29, #-64]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldr q31, [sp, #224]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v27.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v25.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v31.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v28.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v26.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v23.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ mov v21.16b, v29.16b
+ stur q29, [x29, #-128]
+ mov v29.16b, v30.16b
+ mov v30.16b, v27.16b
+ mov v27.16b, v18.16b
+ str q18, [sp, #176]
+ eor v0.16b, v0.16b, v1.16b
+ mov v18.16b, v22.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-96]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v20.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v29.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v31.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v21.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v28.4s
+ add v6.4s, v6.4s, v3.4s
+ mov v22.16b, v24.16b
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q24, [x29, #-80]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ mov v21.16b, v30.16b
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldur q30, [x29, #-192]
+ mov v20.16b, v29.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ ldur q29, [x29, #-112]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v29.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v20.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v31.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v26.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v23.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v27.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v30.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ ldur q27, [x29, #-160]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v27.4s
+ mov v28.16b, v25.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v21.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v28.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v29.4s
+ mov v25.16b, v31.16b
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q31, [x29, #-96]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldur q28, [x29, #-208]
+ mov v18.16b, v20.16b
+ str q20, [sp, #144]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ ldur q20, [x29, #-128]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v24.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v31.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v28.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v20.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v25.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ add v17.4s, v17.4s, v26.4s
+ mov v26.16b, v21.16b
+ add v4.4s, v4.4s, v21.4s
+ ldur q21, [x29, #-144]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v21.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v28.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ str q23, [sp, #160]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ add v17.4s, v17.4s, v23.4s
+ ldur q23, [x29, #-64]
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v23.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v24.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v20.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q20, [sp, #176]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ eor v12.16b, v12.16b, v13.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v31.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ add v7.4s, v7.4s, v29.4s
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v2.4s
+ orr v0.16b, v0.16b, v15.16b
+ mov v15.16b, v31.16b
+ add v17.4s, v17.4s, v22.4s
+ eor v31.16b, v14.16b, v4.16b
+ eor v22.16b, v11.16b, v7.16b
+ add v11.4s, v13.4s, v27.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v11.4s, v11.4s, v0.4s
+ tbl v31.16b, { v31.16b }, v19.16b
+ add v6.4s, v6.4s, v3.4s
+ eor v12.16b, v12.16b, v11.16b
+ tbl v22.16b, { v22.16b }, v19.16b
+ add v8.4s, v8.4s, v31.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v30.4s, v11.4s, v30.4s
+ tbl v11.16b, { v12.16b }, v19.16b
+ add v1.4s, v1.4s, v22.4s
+ eor v9.16b, v8.16b, v9.16b
+ ushr v12.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ add v5.4s, v5.4s, v11.4s
+ eor v2.16b, v1.16b, v2.16b
+ orr v10.16b, v10.16b, v12.16b
+ ushr v12.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v9.16b, v9.16b, v12.16b
+ ushr v12.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v2.16b, v2.16b, v12.16b
+ ushr v12.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v12.16b
+ add v4.4s, v4.4s, v26.4s
+ add v17.4s, v17.4s, v0.4s
+ add v7.4s, v7.4s, v28.4s
+ mov v18.16b, v27.16b
+ eor v31.16b, v31.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v27.4s, v30.4s, v2.4s
+ eor v22.16b, v22.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ eor v3.16b, v3.16b, v27.16b
+ add v26.4s, v27.4s, v29.4s
+ tbl v27.16b, { v31.16b }, v16.16b
+ eor v28.16b, v11.16b, v7.16b
+ tbl v22.16b, { v22.16b }, v16.16b
+ add v1.4s, v1.4s, v27.4s
+ add v4.4s, v4.4s, v23.4s
+ ldr q23, [sp, #144]
+ tbl v28.16b, { v28.16b }, v16.16b
+ tbl v3.16b, { v3.16b }, v16.16b
+ add v5.4s, v5.4s, v22.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v6.4s, v6.4s, v28.4s
+ add v29.4s, v8.4s, v3.4s
+ eor v30.16b, v5.16b, v10.16b
+ ushr v8.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v31.16b, v6.16b, v9.16b
+ orr v0.16b, v0.16b, v8.16b
+ ushr v8.4s, v30.4s, #12
+ shl v30.4s, v30.4s, #20
+ eor v2.16b, v29.16b, v2.16b
+ orr v30.16b, v30.16b, v8.16b
+ ushr v8.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ add v17.4s, v17.4s, v25.4s
+ add v7.4s, v7.4s, v23.4s
+ orr v31.16b, v31.16b, v8.16b
+ ushr v8.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ldur q23, [x29, #-176]
+ orr v2.16b, v2.16b, v8.16b
+ add v17.4s, v17.4s, v0.4s
+ eor v27.16b, v27.16b, v17.16b
+ add v4.4s, v4.4s, v30.4s
+ add v25.4s, v26.4s, v2.4s
+ eor v22.16b, v22.16b, v4.16b
+ add v4.4s, v4.4s, v24.4s
+ add v7.4s, v7.4s, v31.4s
+ eor v3.16b, v3.16b, v25.16b
+ add v24.4s, v25.4s, v18.4s
+ tbl v25.16b, { v27.16b }, v19.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v23.16b, v28.16b, v7.16b
+ tbl v22.16b, { v22.16b }, v19.16b
+ add v1.4s, v1.4s, v25.4s
+ tbl v23.16b, { v23.16b }, v19.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v5.4s, v5.4s, v22.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v6.4s, v6.4s, v23.4s
+ add v26.4s, v29.4s, v3.4s
+ eor v27.16b, v5.16b, v30.16b
+ ushr v29.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v28.16b, v6.16b, v31.16b
+ orr v0.16b, v0.16b, v29.16b
+ ushr v29.4s, v27.4s, #7
+ shl v27.4s, v27.4s, #25
+ eor v2.16b, v26.16b, v2.16b
+ orr v27.16b, v27.16b, v29.16b
+ ushr v29.4s, v28.4s, #7
+ shl v28.4s, v28.4s, #25
+ ldur q18, [x29, #-128]
+ orr v28.16b, v28.16b, v29.16b
+ ushr v29.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v7.4s, v7.4s, v15.4s
+ orr v2.16b, v2.16b, v29.16b
+ add v17.4s, v17.4s, v27.4s
+ add v4.4s, v4.4s, v28.4s
+ add v7.4s, v7.4s, v2.4s
+ eor v3.16b, v3.16b, v17.16b
+ add v17.4s, v17.4s, v20.4s
+ eor v20.16b, v25.16b, v4.16b
+ add v4.4s, v4.4s, v21.4s
+ eor v21.16b, v22.16b, v7.16b
+ add v7.4s, v7.4s, v18.4s
+ add v18.4s, v24.4s, v0.4s
+ eor v22.16b, v23.16b, v18.16b
+ ldr q23, [sp, #160]
+ tbl v3.16b, { v3.16b }, v16.16b
+ tbl v20.16b, { v20.16b }, v16.16b
+ add v6.4s, v6.4s, v3.4s
+ add v18.4s, v18.4s, v23.4s
+ tbl v21.16b, { v21.16b }, v16.16b
+ tbl v16.16b, { v22.16b }, v16.16b
+ add v22.4s, v26.4s, v20.4s
+ eor v23.16b, v6.16b, v27.16b
+ add v1.4s, v1.4s, v21.4s
+ eor v24.16b, v22.16b, v28.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v5.4s, v5.4s, v16.4s
+ eor v2.16b, v1.16b, v2.16b
+ orr v23.16b, v23.16b, v25.16b
+ ushr v25.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ orr v24.16b, v24.16b, v25.16b
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ ushr v25.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ orr v0.16b, v0.16b, v25.16b
+ add v25.4s, v7.4s, v2.4s
+ add v26.4s, v18.4s, v0.4s
+ eor v18.16b, v21.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ add v4.4s, v4.4s, v24.4s
+ eor v16.16b, v16.16b, v26.16b
+ tbl v21.16b, { v18.16b }, v19.16b
+ eor v3.16b, v3.16b, v17.16b
+ eor v7.16b, v20.16b, v4.16b
+ tbl v16.16b, { v16.16b }, v19.16b
+ add v1.4s, v1.4s, v21.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ tbl v20.16b, { v7.16b }, v19.16b
+ eor v2.16b, v1.16b, v2.16b
+ eor v7.16b, v1.16b, v17.16b
+ add v1.4s, v5.4s, v16.4s
+ eor v0.16b, v1.16b, v0.16b
+ eor v18.16b, v1.16b, v4.16b
+ add v1.4s, v6.4s, v3.4s
+ eor v4.16b, v1.16b, v23.16b
+ eor v6.16b, v25.16b, v1.16b
+ add v1.4s, v22.4s, v20.4s
+ eor v5.16b, v1.16b, v24.16b
+ eor v17.16b, v26.16b, v1.16b
+ ushr v1.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ orr v1.16b, v4.16b, v1.16b
+ ushr v4.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v4.16b, v5.16b, v4.16b
+ ushr v5.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v2.16b, v2.16b, v5.16b
+ ushr v5.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v5.16b
+ eor v10.16b, v0.16b, v20.16b
+ eor v11.16b, v1.16b, v21.16b
+ eor v19.16b, v4.16b, v16.16b
+ cmp x0, x22
+ eor v16.16b, v2.16b, v3.16b
+ mov w6, w19
+ b.ne .LBB2_4
+.LBB2_7:
+ zip1 v0.4s, v7.4s, v18.4s
+ zip2 v1.4s, v7.4s, v18.4s
+ zip1 v2.4s, v6.4s, v17.4s
+ zip2 v3.4s, v6.4s, v17.4s
+ zip1 v4.4s, v10.4s, v11.4s
+ zip2 v5.4s, v10.4s, v11.4s
+ zip1 v6.4s, v19.4s, v16.4s
+ zip2 v7.4s, v19.4s, v16.4s
+ add x15, x20, #4
+ tst w5, #0x1
+ sub x28, x28, #4
+ zip1 v16.2d, v0.2d, v2.2d
+ zip2 v0.2d, v0.2d, v2.2d
+ zip1 v2.2d, v1.2d, v3.2d
+ zip2 v1.2d, v1.2d, v3.2d
+ zip1 v3.2d, v4.2d, v6.2d
+ zip2 v4.2d, v4.2d, v6.2d
+ zip1 v6.2d, v5.2d, v7.2d
+ zip2 v5.2d, v5.2d, v7.2d
+ add x24, x24, #32
+ csel x20, x15, x20, ne
+ cmp x28, #3
+ stp q16, q3, [x26]
+ stp q0, q4, [x26, #32]
+ stp q2, q6, [x26, #64]
+ stp q1, q5, [x26, #96]
+ add x26, x26, #128
+ b.hi .LBB2_2
+.LBB2_8:
+ cbz x28, .LBB2_16
+ orr w8, w7, w19
+ and x21, x5, #0x1
+ stur w8, [x29, #-64]
+.LBB2_10:
+ ldr x8, [sp, #40]
+ ldr x25, [x24]
+ ldur w4, [x29, #-64]
+ ldp q1, q0, [x8]
+ mov x8, x22
+ stp q1, q0, [x29, #-48]
+.LBB2_11:
+ subs x23, x8, #1
+ b.eq .LBB2_13
+ cbnz x8, .LBB2_14
+ b .LBB2_15
+.LBB2_13:
+ orr w4, w4, w27
+.LBB2_14:
+ sub x0, x29, #48
+ mov w2, #64
+ mov x1, x25
+ mov x3, x20
+ bl zfs_blake3_compress_in_place_sse41
+ add x25, x25, #64
+ mov x8, x23
+ mov w4, w19
+ b .LBB2_11
+.LBB2_15:
+ ldp q0, q1, [x29, #-48]
+ add x20, x20, x21
+ add x24, x24, #8
+ subs x28, x28, #1
+ stp q0, q1, [x26], #32
+ b.ne .LBB2_10
+.LBB2_16:
+ add sp, sp, #448
+ ldp x20, x19, [sp, #144]
+ ldp x22, x21, [sp, #128]
+ ldp x24, x23, [sp, #112]
+ ldp x26, x25, [sp, #96]
+ ldp x28, x27, [sp, #80]
+ ldp x29, x30, [sp, #64]
+ ldp d9, d8, [sp, #48]
+ ldp d11, d10, [sp, #32]
+ ldp d13, d12, [sp, #16]
+ ldp d15, d14, [sp], #160
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S
new file mode 100644
index 000000000..9deba202f
--- /dev/null
+++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S
@@ -0,0 +1,2823 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE2 -> POWER8 PPC64 Little Endian
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ .text
+ .abiversion 2
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI0_1:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+.LCPI0_2:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_3:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_4:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_5:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_6:
+ .short 1
+ .short 2
+ .short 4
+ .short 8
+ .short 16
+ .short 32
+ .short 64
+ .short 128
+.LCPI0_7:
+ .short 0
+ .short 0
+ .short 4
+ .short 8
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI0_8:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_9:
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI0_10:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_12:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_13:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_14:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .text
+ .globl zfs_blake3_compress_in_place_sse2
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse2,@function
+zfs_blake3_compress_in_place_sse2:
+.Lfunc_begin0:
+ .cfi_startproc
+.Lfunc_gep0:
+ addis 2, 12, .TOC.-.Lfunc_gep0@ha
+ addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+ .localentry zfs_blake3_compress_in_place_sse2, .Lfunc_lep0-.Lfunc_gep0
+ li 8, -64
+ mtvsrd 35, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ vspltisw 12, 9
+ stxvd2x 60, 1, 8
+ li 8, -48
+ mtvsrd 36, 7
+ lfd 2, 16(4)
+ stxvd2x 61, 1, 8
+ li 8, -32
+ lfd 1, 8(4)
+ mtvsrwz 37, 6
+ rldicl 6, 6, 32, 32
+ addis 7, 2, .LCPI0_2@toc@ha
+ stxvd2x 62, 1, 8
+ li 8, -16
+ addi 7, 7, .LCPI0_2@toc@l
+ stxvd2x 63, 1, 8
+ li 8, 0
+ lvx 9, 0, 7
+ li 7, 48
+ mtvsrd 34, 8
+ xxmrghd 32, 1, 0
+ lxvd2x 0, 0, 3
+ lxvd2x 1, 3, 5
+ lfd 3, 24(4)
+ addis 8, 2, .LCPI0_5@toc@ha
+ vmrghb 3, 2, 3
+ addi 8, 8, .LCPI0_5@toc@l
+ vmrghb 4, 2, 4
+ vspltb 2, 2, 7
+ xxmrghd 33, 3, 2
+ vpkudum 7, 1, 0
+ vmrglh 3, 2, 3
+ vmrglh 2, 2, 4
+ mtvsrwz 36, 6
+ addis 6, 2, .LCPI0_0@toc@ha
+ addi 6, 6, .LCPI0_0@toc@l
+ vperm 10, 1, 0, 9
+ vmrghw 4, 4, 5
+ xxswapd 37, 1
+ lxvd2x 1, 4, 7
+ addis 7, 2, .LCPI0_8@toc@ha
+ addi 7, 7, .LCPI0_8@toc@l
+ vmrglw 2, 2, 3
+ xxswapd 35, 0
+ xxswapd 41, 1
+ xxspltd 62, 42, 1
+ vadduwm 3, 7, 3
+ vadduwm 6, 3, 5
+ xxmrgld 36, 34, 36
+ lvx 2, 0, 6
+ addis 6, 2, .LCPI0_1@toc@ha
+ addi 6, 6, .LCPI0_1@toc@l
+ xxlxor 35, 38, 36
+ lvx 4, 0, 6
+ li 6, 32
+ lxvd2x 0, 4, 6
+ addis 4, 2, .LCPI0_3@toc@ha
+ addis 6, 2, .LCPI0_7@toc@ha
+ vperm 8, 3, 3, 2
+ vspltisw 3, 10
+ addi 4, 4, .LCPI0_3@toc@l
+ addi 6, 6, .LCPI0_7@toc@l
+ vadduwm 3, 3, 3
+ vadduwm 11, 8, 4
+ xxlxor 36, 43, 37
+ vadduwm 5, 6, 10
+ vrlw 0, 4, 3
+ vspltisw 4, 12
+ vadduwm 4, 4, 4
+ vadduwm 1, 0, 5
+ xxlxor 37, 33, 40
+ xxswapd 40, 0
+ vrlw 6, 5, 4
+ vspltisw 5, -16
+ vpkudum 13, 9, 8
+ vsubuwm 5, 12, 5
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI0_4@toc@ha
+ addi 4, 4, .LCPI0_4@toc@l
+ vadduwm 11, 6, 11
+ xxswapd 0, 38
+ vadduwm 1, 1, 13
+ xxsldwi 50, 45, 45, 1
+ xxlxor 32, 43, 32
+ xxsldwi 43, 43, 43, 3
+ xxsldwi 33, 33, 33, 1
+ vperm 12, 8, 9, 12
+ vrlw 0, 0, 5
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 0
+ vadduwm 1, 1, 12
+ vperm 6, 6, 6, 2
+ vadduwm 15, 6, 11
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI0_6@toc@ha
+ addi 4, 4, .LCPI0_6@toc@l
+ xxlxor 32, 47, 32
+ lvx 17, 0, 4
+ addis 4, 2, .LCPI0_9@toc@ha
+ vperm 14, 10, 7, 11
+ addi 4, 4, .LCPI0_9@toc@l
+ vrlw 0, 0, 3
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 38
+ vrlw 6, 6, 4
+ vadduwm 8, 6, 15
+ xxswapd 0, 38
+ lvx 6, 0, 8
+ xxlxor 32, 40, 32
+ xxsldwi 40, 40, 40, 1
+ vperm 13, 12, 18, 6
+ vrlw 9, 0, 5
+ vadduwm 0, 1, 14
+ lvx 1, 0, 7
+ xxsldwi 46, 46, 46, 3
+ xxsldwi 32, 32, 32, 3
+ vperm 7, 7, 7, 1
+ vadduwm 15, 9, 0
+ xxlxor 32, 47, 0
+ vperm 16, 0, 0, 2
+ lvx 0, 0, 6
+ addis 6, 2, .LCPI0_10@toc@ha
+ vcmpequh 0, 0, 17
+ vadduwm 19, 16, 8
+ xxlxor 40, 51, 41
+ xxsel 45, 39, 45, 32
+ vrlw 31, 8, 3
+ lvx 8, 0, 4
+ addis 4, 2, .LCPI0_11@toc@ha
+ addi 4, 4, .LCPI0_11@toc@l
+ vcmpequh 7, 8, 17
+ vadduwm 8, 15, 13
+ vadduwm 15, 31, 8
+ lvx 8, 0, 4
+ addi 4, 6, .LCPI0_10@toc@l
+ lvx 17, 0, 4
+ addis 4, 2, .LCPI0_12@toc@ha
+ xxlxor 41, 47, 48
+ xxsldwi 47, 47, 47, 1
+ addi 4, 4, .LCPI0_12@toc@l
+ xxlnor 48, 39, 39
+ vrlw 29, 9, 4
+ vperm 9, 16, 16, 8
+ xxland 48, 50, 39
+ vperm 17, 30, 12, 17
+ vperm 16, 16, 16, 8
+ vmrghw 12, 12, 10
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI0_13@toc@ha
+ vadduwm 19, 29, 19
+ addi 4, 4, .LCPI0_13@toc@l
+ xxlxor 63, 51, 63
+ xxsldwi 51, 51, 51, 3
+ xxland 0, 49, 41
+ vrlw 17, 31, 5
+ xxlor 48, 0, 48
+ xxswapd 0, 61
+ vperm 18, 12, 18, 10
+ vadduwm 15, 15, 16
+ xxland 60, 48, 39
+ vadduwm 15, 17, 15
+ vperm 28, 28, 28, 8
+ xxlxor 63, 47, 0
+ vadduwm 15, 15, 18
+ vperm 31, 31, 31, 2
+ vperm 30, 18, 16, 6
+ vadduwm 19, 31, 19
+ xxlxor 44, 51, 49
+ vrlw 12, 12, 3
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 63
+ vperm 31, 13, 14, 11
+ vrlw 17, 17, 4
+ vperm 14, 14, 14, 1
+ vadduwm 15, 15, 31
+ vadduwm 19, 17, 19
+ xxswapd 0, 49
+ xxsldwi 47, 47, 47, 3
+ xxsel 46, 46, 62, 32
+ xxlxor 44, 51, 44
+ xxsldwi 51, 51, 51, 1
+ vrlw 12, 12, 5
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 19, 17, 19
+ xxlxor 44, 51, 44
+ vrlw 29, 12, 3
+ vadduwm 12, 15, 14
+ vadduwm 15, 29, 12
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI0_14@toc@ha
+ addi 4, 4, .LCPI0_14@toc@l
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ vperm 30, 13, 18, 12
+ vrlw 17, 17, 4
+ vmrghw 13, 18, 13
+ xxland 0, 62, 41
+ vadduwm 19, 17, 19
+ vperm 16, 13, 16, 10
+ xxlxor 61, 51, 61
+ xxsldwi 50, 51, 51, 3
+ xxsldwi 51, 63, 63, 3
+ vrlw 30, 29, 5
+ xxlor 61, 60, 0
+ xxswapd 0, 49
+ vperm 31, 14, 19, 11
+ vadduwm 15, 15, 29
+ vperm 19, 19, 19, 1
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 16
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 62
+ vperm 30, 16, 29, 6
+ vrlw 13, 13, 3
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 31
+ xxsldwi 63, 63, 63, 3
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 45, 50, 45
+ xxsldwi 50, 50, 50, 1
+ vrlw 13, 13, 5
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 45
+ vrlw 28, 13, 3
+ xxsel 45, 51, 62, 32
+ xxland 51, 61, 39
+ vperm 30, 14, 16, 12
+ vadduwm 15, 15, 13
+ vperm 19, 19, 19, 8
+ vmrghw 14, 16, 14
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 62, 41
+ vrlw 17, 17, 4
+ xxlor 51, 51, 0
+ vadduwm 15, 15, 19
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 60, 50, 60
+ xxsldwi 48, 50, 50, 3
+ vperm 18, 14, 29, 10
+ vrlw 30, 28, 5
+ vperm 29, 18, 19, 6
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 18
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 62
+ vperm 30, 13, 31, 11
+ vrlw 14, 14, 3
+ vperm 31, 31, 31, 1
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 46, 48, 46
+ xxsldwi 48, 48, 48, 1
+ vrlw 14, 14, 5
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 46
+ vrlw 28, 14, 3
+ xxsel 46, 63, 61, 32
+ xxland 63, 51, 39
+ vperm 29, 13, 18, 12
+ vadduwm 15, 15, 14
+ vperm 31, 31, 31, 8
+ vmrghw 13, 18, 13
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 61, 41
+ vrlw 17, 17, 4
+ xxlor 63, 63, 0
+ vperm 13, 13, 19, 10
+ xxsldwi 51, 62, 62, 3
+ vadduwm 15, 15, 31
+ vperm 30, 14, 19, 11
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 60, 48, 60
+ xxsldwi 48, 48, 48, 3
+ vrlw 29, 28, 5
+ vadduwm 15, 29, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 50, 48, 61
+ vrlw 18, 18, 3
+ vadduwm 15, 18, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 11, 17, 16
+ xxswapd 0, 49
+ xxlxor 48, 43, 50
+ xxsldwi 43, 43, 43, 1
+ vperm 18, 19, 19, 1
+ vrlw 16, 16, 5
+ vperm 19, 13, 31, 6
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 29, 17, 11
+ xxlxor 43, 61, 48
+ vrlw 16, 11, 3
+ xxsel 43, 50, 51, 32
+ xxland 50, 63, 39
+ vperm 19, 14, 13, 12
+ vadduwm 15, 15, 11
+ vperm 18, 18, 18, 8
+ vmrghw 13, 13, 14
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 51, 41
+ lvx 19, 0, 4
+ vrlw 17, 17, 4
+ xxlor 50, 50, 0
+ vperm 13, 13, 31, 10
+ xxsldwi 63, 62, 62, 3
+ vadduwm 15, 15, 18
+ vperm 19, 11, 31, 19
+ vadduwm 29, 17, 29
+ xxswapd 0, 49
+ vperm 1, 31, 31, 1
+ xxlxor 48, 61, 48
+ xxsldwi 46, 61, 61, 3
+ vperm 6, 13, 18, 6
+ vrlw 16, 16, 5
+ xxsel 32, 33, 38, 32
+ xxland 38, 50, 39
+ vadduwm 15, 16, 15
+ vperm 7, 11, 13, 12
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vperm 6, 6, 6, 8
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 3
+ vrlw 17, 17, 4
+ vadduwm 15, 15, 19
+ vadduwm 14, 17, 14
+ xxswapd 0, 49
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 5
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vadduwm 0, 15, 0
+ vperm 17, 17, 17, 2
+ xxland 0, 39, 41
+ xxlor 38, 38, 0
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 0, 16, 0
+ xxlxor 33, 32, 49
+ xxsldwi 32, 32, 32, 1
+ vrlw 1, 1, 4
+ vadduwm 0, 0, 6
+ vadduwm 8, 1, 14
+ xxswapd 0, 33
+ xxlxor 44, 40, 48
+ xxsldwi 38, 40, 40, 3
+ vrlw 7, 12, 5
+ vadduwm 0, 7, 0
+ xxlxor 33, 32, 0
+ vperm 2, 1, 1, 2
+ vmrghw 1, 13, 11
+ vadduwm 6, 2, 6
+ vperm 1, 1, 18, 10
+ xxlxor 39, 38, 39
+ vrlw 3, 7, 3
+ vadduwm 0, 0, 1
+ vadduwm 0, 3, 0
+ xxlxor 34, 32, 34
+ xxsldwi 0, 32, 32, 3
+ vrlw 2, 2, 4
+ vadduwm 4, 2, 6
+ xxswapd 2, 34
+ xxlxor 35, 36, 35
+ xxsldwi 1, 36, 36, 1
+ vrlw 3, 3, 5
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 1, 35, 2
+ stxvd2x 0, 0, 3
+ xxswapd 1, 1
+ stxvd2x 1, 3, 5
+ li 3, -16
+ lxvd2x 63, 1, 3
+ li 3, -32
+ lxvd2x 62, 1, 3
+ li 3, -48
+ lxvd2x 61, 1, 3
+ li 3, -64
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI1_0:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI1_1:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+.LCPI1_2:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_3:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_4:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_5:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_6:
+ .short 1
+ .short 2
+ .short 4
+ .short 8
+ .short 16
+ .short 32
+ .short 64
+ .short 128
+.LCPI1_7:
+ .short 0
+ .short 0
+ .short 4
+ .short 8
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI1_8:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_9:
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI1_10:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_12:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_13:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_14:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .text
+ .globl zfs_blake3_compress_xof_sse2
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2:
+.Lfunc_begin1:
+ .cfi_startproc
+.Lfunc_gep1:
+ addis 2, 12, .TOC.-.Lfunc_gep1@ha
+ addi 2, 2, .TOC.-.Lfunc_gep1@l
+.Lfunc_lep1:
+ .localentry zfs_blake3_compress_xof_sse2, .Lfunc_lep1-.Lfunc_gep1
+ li 9, -80
+ mtvsrd 35, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ addis 10, 2, .LCPI1_2@toc@ha
+ vspltisw 12, 9
+ std 30, -16(1)
+ addis 12, 2, .LCPI1_8@toc@ha
+ addis 30, 2, .LCPI1_5@toc@ha
+ addis 11, 2, .LCPI1_7@toc@ha
+ stxvd2x 60, 1, 9
+ li 9, -64
+ mtvsrd 36, 7
+ lfd 2, 16(4)
+ addi 10, 10, .LCPI1_2@toc@l
+ addi 12, 12, .LCPI1_8@toc@l
+ addi 11, 11, .LCPI1_7@toc@l
+ stxvd2x 61, 1, 9
+ li 9, -48
+ lfd 3, 24(4)
+ mtvsrwz 37, 6
+ rldicl 6, 6, 32, 32
+ lvx 9, 0, 10
+ stxvd2x 62, 1, 9
+ li 9, -32
+ li 10, 32
+ stxvd2x 63, 1, 9
+ li 9, 0
+ mtvsrd 34, 9
+ xxmrghd 33, 3, 2
+ lfd 1, 8(4)
+ vmrghb 3, 2, 3
+ vmrghb 4, 2, 4
+ vspltb 2, 2, 7
+ xxmrghd 32, 1, 0
+ lxvd2x 0, 0, 3
+ lxvd2x 1, 3, 5
+ vpkudum 7, 1, 0
+ vmrglh 3, 2, 3
+ vmrglh 2, 2, 4
+ mtvsrwz 36, 6
+ addis 6, 2, .LCPI1_0@toc@ha
+ addi 6, 6, .LCPI1_0@toc@l
+ vperm 10, 1, 0, 9
+ vmrghw 4, 4, 5
+ xxswapd 37, 1
+ vmrglw 2, 2, 3
+ xxswapd 35, 0
+ lxvd2x 0, 4, 10
+ xxspltd 62, 42, 1
+ vadduwm 3, 7, 3
+ vadduwm 6, 3, 5
+ xxmrgld 36, 34, 36
+ lvx 2, 0, 6
+ addis 6, 2, .LCPI1_1@toc@ha
+ addi 6, 6, .LCPI1_1@toc@l
+ xxlxor 35, 38, 36
+ lvx 4, 0, 6
+ li 6, 48
+ lxvd2x 1, 4, 6
+ addis 4, 2, .LCPI1_3@toc@ha
+ vperm 8, 3, 3, 2
+ vspltisw 3, 10
+ addi 4, 4, .LCPI1_3@toc@l
+ xxswapd 41, 1
+ vadduwm 3, 3, 3
+ vadduwm 11, 8, 4
+ xxlxor 36, 43, 37
+ vadduwm 5, 6, 10
+ vrlw 0, 4, 3
+ vspltisw 4, 12
+ vadduwm 4, 4, 4
+ vadduwm 1, 0, 5
+ xxlxor 37, 33, 40
+ xxswapd 40, 0
+ vrlw 6, 5, 4
+ vspltisw 5, -16
+ vpkudum 13, 9, 8
+ vsubuwm 5, 12, 5
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI1_4@toc@ha
+ addi 4, 4, .LCPI1_4@toc@l
+ vadduwm 11, 6, 11
+ xxswapd 0, 38
+ vadduwm 1, 1, 13
+ xxsldwi 50, 45, 45, 1
+ xxlxor 32, 43, 32
+ xxsldwi 43, 43, 43, 3
+ xxsldwi 33, 33, 33, 1
+ vperm 12, 8, 9, 12
+ vrlw 0, 0, 5
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 0
+ vadduwm 1, 1, 12
+ vperm 6, 6, 6, 2
+ vadduwm 15, 6, 11
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI1_6@toc@ha
+ addi 4, 4, .LCPI1_6@toc@l
+ xxlxor 32, 47, 32
+ lvx 17, 0, 4
+ addi 4, 30, .LCPI1_5@toc@l
+ vperm 14, 10, 7, 11
+ vrlw 0, 0, 3
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 38
+ vrlw 6, 6, 4
+ vadduwm 8, 6, 15
+ xxswapd 0, 38
+ lvx 6, 0, 4
+ addis 4, 2, .LCPI1_9@toc@ha
+ addi 4, 4, .LCPI1_9@toc@l
+ xxlxor 32, 40, 32
+ xxsldwi 40, 40, 40, 1
+ vperm 13, 12, 18, 6
+ vrlw 9, 0, 5
+ vadduwm 0, 1, 14
+ lvx 1, 0, 12
+ xxsldwi 46, 46, 46, 3
+ xxsldwi 32, 32, 32, 3
+ vperm 7, 7, 7, 1
+ vadduwm 15, 9, 0
+ xxlxor 32, 47, 0
+ vperm 16, 0, 0, 2
+ lvx 0, 0, 11
+ addis 11, 2, .LCPI1_10@toc@ha
+ vcmpequh 0, 0, 17
+ vadduwm 19, 16, 8
+ xxlxor 40, 51, 41
+ xxsel 45, 39, 45, 32
+ vrlw 31, 8, 3
+ lvx 8, 0, 4
+ addis 4, 2, .LCPI1_11@toc@ha
+ addi 4, 4, .LCPI1_11@toc@l
+ vcmpequh 7, 8, 17
+ vadduwm 8, 15, 13
+ vadduwm 15, 31, 8
+ lvx 8, 0, 4
+ addi 4, 11, .LCPI1_10@toc@l
+ lvx 17, 0, 4
+ addis 4, 2, .LCPI1_12@toc@ha
+ xxlxor 41, 47, 48
+ xxsldwi 47, 47, 47, 1
+ addi 4, 4, .LCPI1_12@toc@l
+ xxlnor 48, 39, 39
+ vrlw 29, 9, 4
+ vperm 9, 16, 16, 8
+ xxland 48, 50, 39
+ vperm 17, 30, 12, 17
+ vperm 16, 16, 16, 8
+ vmrghw 12, 12, 10
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI1_13@toc@ha
+ vadduwm 19, 29, 19
+ addi 4, 4, .LCPI1_13@toc@l
+ xxlxor 63, 51, 63
+ xxsldwi 51, 51, 51, 3
+ xxland 0, 49, 41
+ vrlw 17, 31, 5
+ xxlor 48, 0, 48
+ xxswapd 0, 61
+ vperm 18, 12, 18, 10
+ vadduwm 15, 15, 16
+ xxland 60, 48, 39
+ vadduwm 15, 17, 15
+ vperm 28, 28, 28, 8
+ xxlxor 63, 47, 0
+ vadduwm 15, 15, 18
+ vperm 31, 31, 31, 2
+ vperm 30, 18, 16, 6
+ vadduwm 19, 31, 19
+ xxlxor 44, 51, 49
+ vrlw 12, 12, 3
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 63
+ vperm 31, 13, 14, 11
+ vrlw 17, 17, 4
+ vperm 14, 14, 14, 1
+ vadduwm 15, 15, 31
+ vadduwm 19, 17, 19
+ xxswapd 0, 49
+ xxsldwi 47, 47, 47, 3
+ xxsel 46, 46, 62, 32
+ xxlxor 44, 51, 44
+ xxsldwi 51, 51, 51, 1
+ vrlw 12, 12, 5
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 19, 17, 19
+ xxlxor 44, 51, 44
+ vrlw 29, 12, 3
+ vadduwm 12, 15, 14
+ vadduwm 15, 29, 12
+ lvx 12, 0, 4
+ addis 4, 2, .LCPI1_14@toc@ha
+ addi 4, 4, .LCPI1_14@toc@l
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ vperm 30, 13, 18, 12
+ vrlw 17, 17, 4
+ vmrghw 13, 18, 13
+ xxland 0, 62, 41
+ vadduwm 19, 17, 19
+ vperm 16, 13, 16, 10
+ xxlxor 61, 51, 61
+ xxsldwi 50, 51, 51, 3
+ xxsldwi 51, 63, 63, 3
+ vrlw 30, 29, 5
+ xxlor 61, 60, 0
+ xxswapd 0, 49
+ vperm 31, 14, 19, 11
+ vadduwm 15, 15, 29
+ vperm 19, 19, 19, 1
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 16
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 62
+ vperm 30, 16, 29, 6
+ vrlw 13, 13, 3
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 31
+ xxsldwi 63, 63, 63, 3
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 45, 50, 45
+ xxsldwi 50, 50, 50, 1
+ vrlw 13, 13, 5
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 45
+ vrlw 28, 13, 3
+ xxsel 45, 51, 62, 32
+ xxland 51, 61, 39
+ vperm 30, 14, 16, 12
+ vadduwm 15, 15, 13
+ vperm 19, 19, 19, 8
+ vmrghw 14, 16, 14
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 62, 41
+ vrlw 17, 17, 4
+ xxlor 51, 51, 0
+ vadduwm 15, 15, 19
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 60, 50, 60
+ xxsldwi 48, 50, 50, 3
+ vperm 18, 14, 29, 10
+ vrlw 30, 28, 5
+ vperm 29, 18, 19, 6
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 18
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 62
+ vperm 30, 13, 31, 11
+ vrlw 14, 14, 3
+ vperm 31, 31, 31, 1
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 46, 48, 46
+ xxsldwi 48, 48, 48, 1
+ vrlw 14, 14, 5
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 46
+ vrlw 28, 14, 3
+ xxsel 46, 63, 61, 32
+ xxland 63, 51, 39
+ vperm 29, 13, 18, 12
+ vadduwm 15, 15, 14
+ vperm 31, 31, 31, 8
+ vmrghw 13, 18, 13
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 61, 41
+ vrlw 17, 17, 4
+ xxlor 63, 63, 0
+ vperm 13, 13, 19, 10
+ xxsldwi 51, 62, 62, 3
+ vadduwm 15, 15, 31
+ vperm 30, 14, 19, 11
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 60, 48, 60
+ xxsldwi 48, 48, 48, 3
+ vrlw 29, 28, 5
+ vadduwm 15, 29, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 50, 48, 61
+ vrlw 18, 18, 3
+ vadduwm 15, 18, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 11, 17, 16
+ xxswapd 0, 49
+ xxlxor 48, 43, 50
+ xxsldwi 43, 43, 43, 1
+ vperm 18, 19, 19, 1
+ vrlw 16, 16, 5
+ vperm 19, 13, 31, 6
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 29, 17, 11
+ xxlxor 43, 61, 48
+ vrlw 16, 11, 3
+ xxsel 43, 50, 51, 32
+ xxland 50, 63, 39
+ vperm 19, 14, 13, 12
+ vadduwm 15, 15, 11
+ vperm 18, 18, 18, 8
+ vmrghw 13, 13, 14
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 51, 41
+ lvx 19, 0, 4
+ vrlw 17, 17, 4
+ xxlor 50, 50, 0
+ vperm 13, 13, 31, 10
+ xxsldwi 63, 62, 62, 3
+ vadduwm 15, 15, 18
+ vperm 19, 11, 31, 19
+ vadduwm 29, 17, 29
+ xxswapd 0, 49
+ vperm 1, 31, 31, 1
+ xxlxor 48, 61, 48
+ xxsldwi 46, 61, 61, 3
+ vperm 6, 13, 18, 6
+ vrlw 16, 16, 5
+ xxsel 32, 33, 38, 32
+ xxland 38, 50, 39
+ vadduwm 15, 16, 15
+ vperm 7, 11, 13, 12
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vperm 6, 6, 6, 8
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 3
+ vrlw 17, 17, 4
+ vadduwm 15, 15, 19
+ vadduwm 14, 17, 14
+ xxswapd 0, 49
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 5
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vadduwm 0, 15, 0
+ vperm 17, 17, 17, 2
+ xxland 0, 39, 41
+ xxlor 38, 38, 0
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 0, 16, 0
+ xxlxor 33, 32, 49
+ xxsldwi 32, 32, 32, 1
+ vrlw 1, 1, 4
+ vadduwm 0, 0, 6
+ vadduwm 8, 1, 14
+ xxswapd 0, 33
+ xxlxor 44, 40, 48
+ xxsldwi 38, 40, 40, 3
+ vrlw 7, 12, 5
+ vadduwm 0, 7, 0
+ xxlxor 33, 32, 0
+ vperm 2, 1, 1, 2
+ vmrghw 1, 13, 11
+ vadduwm 6, 2, 6
+ vperm 1, 1, 18, 10
+ xxlxor 39, 38, 39
+ vrlw 3, 7, 3
+ vadduwm 0, 0, 1
+ vadduwm 0, 3, 0
+ xxlxor 34, 32, 34
+ xxsldwi 0, 32, 32, 3
+ vrlw 2, 2, 4
+ vadduwm 4, 2, 6
+ xxswapd 2, 34
+ xxlxor 35, 36, 35
+ xxsldwi 1, 36, 36, 1
+ vrlw 3, 3, 5
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 3, 35, 2
+ stxvd2x 0, 0, 8
+ xxswapd 3, 3
+ stxvd2x 3, 8, 5
+ lfdx 0, 0, 3
+ lfd 3, 8(3)
+ xxmrghd 34, 3, 0
+ xxlxor 0, 1, 34
+ xxswapd 0, 0
+ stxvd2x 0, 8, 10
+ lfd 0, 16(3)
+ lfd 1, 24(3)
+ li 3, -32
+ xxmrghd 34, 1, 0
+ xxlxor 0, 2, 34
+ xxswapd 0, 0
+ stxvd2x 0, 8, 6
+ lxvd2x 63, 1, 3
+ li 3, -48
+ ld 30, -16(1)
+ lxvd2x 62, 1, 3
+ li 3, -64
+ lxvd2x 61, 1, 3
+ li 3, -80
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-.Lfunc_begin1
+ .cfi_endproc
+
+ .globl zfs_blake3_hash_many_sse2
+ .p2align 2
+ .type zfs_blake3_hash_many_sse2,@function
+zfs_blake3_hash_many_sse2:
+.Lfunc_begin2:
+ .cfi_startproc
+.Lfunc_gep2:
+ addis 2, 12, .TOC.-.Lfunc_gep2@ha
+ addi 2, 2, .TOC.-.Lfunc_gep2@l
+.Lfunc_lep2:
+ .localentry zfs_blake3_hash_many_sse2, .Lfunc_lep2-.Lfunc_gep2
+ mfocrf 12, 32
+ mflr 0
+ std 0, 16(1)
+ stw 12, 8(1)
+ stdu 1, -256(1)
+ .cfi_def_cfa_offset 256
+ .cfi_offset lr, 16
+ .cfi_offset r17, -120
+ .cfi_offset r18, -112
+ .cfi_offset r19, -104
+ .cfi_offset r20, -96
+ .cfi_offset r21, -88
+ .cfi_offset r22, -80
+ .cfi_offset r23, -72
+ .cfi_offset r24, -64
+ .cfi_offset r25, -56
+ .cfi_offset r26, -48
+ .cfi_offset r27, -40
+ .cfi_offset r28, -32
+ .cfi_offset r29, -24
+ .cfi_offset r30, -16
+ .cfi_offset cr2, 8
+ std 26, 208(1)
+ mr 26, 4
+ cmpldi 1, 4, 4
+ andi. 4, 8, 1
+ std 18, 144(1)
+ std 19, 152(1)
+ crmove 8, 1
+ ld 19, 360(1)
+ lwz 18, 352(1)
+ std 24, 192(1)
+ std 25, 200(1)
+ std 27, 216(1)
+ std 28, 224(1)
+ mr 24, 10
+ mr 28, 6
+ mr 27, 5
+ mr 25, 3
+ std 29, 232(1)
+ std 30, 240(1)
+ mr 30, 9
+ mr 29, 7
+ std 17, 136(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ blt 1, .LBB2_3
+ li 3, 0
+ li 4, 1
+ clrldi 23, 30, 32
+ isel 22, 4, 3, 8
+ clrldi 21, 24, 32
+ clrldi 20, 18, 32
+.LBB2_2:
+ mr 3, 25
+ mr 4, 27
+ mr 5, 28
+ mr 6, 29
+ mr 7, 22
+ mr 8, 23
+ mr 9, 21
+ mr 10, 20
+ std 19, 32(1)
+ bl blake3_hash4_sse2
+ addi 26, 26, -4
+ addi 3, 29, 4
+ addi 25, 25, 32
+ addi 19, 19, 128
+ cmpldi 26, 3
+ isel 29, 3, 29, 8
+ bgt 0, .LBB2_2
+.LBB2_3:
+ cmpldi 26, 0
+ beq 0, .LBB2_11
+ li 3, 0
+ li 4, 1
+ or 21, 24, 30
+ li 20, 16
+ addi 24, 1, 96
+ isel 22, 4, 3, 8
+.LBB2_5:
+ lxvd2x 0, 28, 20
+ ld 23, 0(25)
+ mr 17, 27
+ mr 3, 21
+ stxvd2x 0, 24, 20
+ lxvd2x 0, 0, 28
+ stxvd2x 0, 0, 24
+.LBB2_6:
+ cmpldi 17, 1
+ beq 0, .LBB2_8
+ cmpldi 17, 0
+ bne 0, .LBB2_9
+ b .LBB2_10
+.LBB2_8:
+ or 3, 3, 18
+.LBB2_9:
+ clrldi 7, 3, 56
+ mr 3, 24
+ mr 4, 23
+ li 5, 64
+ mr 6, 29
+ bl zfs_blake3_compress_in_place_sse2
+ addi 23, 23, 64
+ addi 17, 17, -1
+ mr 3, 30
+ b .LBB2_6
+.LBB2_10:
+ lxvd2x 0, 24, 20
+ addi 26, 26, -1
+ add 29, 29, 22
+ addi 25, 25, 8
+ cmpldi 26, 0
+ stxvd2x 0, 19, 20
+ lxvd2x 0, 0, 24
+ stxvd2x 0, 0, 19
+ addi 19, 19, 32
+ bne 0, .LBB2_5
+.LBB2_11:
+ ld 30, 240(1)
+ ld 29, 232(1)
+ ld 28, 224(1)
+ ld 27, 216(1)
+ ld 26, 208(1)
+ ld 25, 200(1)
+ ld 24, 192(1)
+ ld 23, 184(1)
+ ld 22, 176(1)
+ ld 21, 168(1)
+ ld 20, 160(1)
+ ld 19, 152(1)
+ ld 18, 144(1)
+ ld 17, 136(1)
+ addi 1, 1, 256
+ ld 0, 16(1)
+ lwz 12, 8(1)
+ mtocrf 32, 12
+ mtlr 0
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end2-.Lfunc_begin2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI3_0:
+ .quad 4294967296
+ .quad 12884901890
+.LCPI3_1:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI3_2:
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+.LCPI3_3:
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+.LCPI3_4:
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+.LCPI3_5:
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .text
+ .p2align 2
+ .type blake3_hash4_sse2,@function
+blake3_hash4_sse2:
+.Lfunc_begin3:
+ .cfi_startproc
+.Lfunc_gep3:
+ addis 2, 12, .TOC.-.Lfunc_gep3@ha
+ addi 2, 2, .TOC.-.Lfunc_gep3@l
+.Lfunc_lep3:
+ .localentry blake3_hash4_sse2, .Lfunc_lep3-.Lfunc_gep3
+ stdu 1, -400(1)
+ .cfi_def_cfa_offset 400
+ .cfi_offset r22, -152
+ .cfi_offset r23, -144
+ .cfi_offset r24, -136
+ .cfi_offset r25, -128
+ .cfi_offset r26, -120
+ .cfi_offset r27, -112
+ .cfi_offset r28, -104
+ .cfi_offset r29, -96
+ .cfi_offset r30, -88
+ .cfi_offset f23, -72
+ .cfi_offset f24, -64
+ .cfi_offset f25, -56
+ .cfi_offset f26, -48
+ .cfi_offset f27, -40
+ .cfi_offset f28, -32
+ .cfi_offset f29, -24
+ .cfi_offset f30, -16
+ .cfi_offset f31, -8
+ .cfi_offset v20, -352
+ .cfi_offset v21, -336
+ .cfi_offset v22, -320
+ .cfi_offset v23, -304
+ .cfi_offset v24, -288
+ .cfi_offset v25, -272
+ .cfi_offset v26, -256
+ .cfi_offset v27, -240
+ .cfi_offset v28, -224
+ .cfi_offset v29, -208
+ .cfi_offset v30, -192
+ .cfi_offset v31, -176
+ li 11, 48
+ li 0, 8
+ std 30, 312(1)
+ li 30, 12
+ li 12, 4
+ lfiwzx 0, 0, 5
+ stxvd2x 52, 1, 11
+ li 11, 64
+ lfiwzx 2, 5, 0
+ li 0, 20
+ lfiwzx 3, 5, 30
+ stxvd2x 53, 1, 11
+ li 11, 80
+ li 30, 24
+ lfiwzx 4, 5, 0
+ li 0, 28
+ stxvd2x 54, 1, 11
+ li 11, 96
+ lfiwzx 1, 5, 12
+ lfiwzx 6, 5, 30
+ xxspltw 45, 0, 1
+ cmpldi 4, 0
+ std 22, 248(1)
+ stxvd2x 55, 1, 11
+ li 11, 112
+ lfiwzx 7, 5, 0
+ xxspltw 40, 2, 1
+ std 23, 256(1)
+ xxspltw 38, 3, 1
+ xxspltw 50, 4, 1
+ std 24, 264(1)
+ std 25, 272(1)
+ std 26, 280(1)
+ xxspltw 54, 7, 1
+ std 27, 288(1)
+ std 28, 296(1)
+ std 29, 304(1)
+ stxvd2x 56, 1, 11
+ li 11, 128
+ stfd 23, 328(1)
+ stxvd2x 57, 1, 11
+ li 11, 144
+ stfd 24, 336(1)
+ stxvd2x 58, 1, 11
+ li 11, 160
+ stfd 25, 344(1)
+ stxvd2x 59, 1, 11
+ li 11, 176
+ xxspltw 59, 1, 1
+ stxvd2x 60, 1, 11
+ li 11, 192
+ stfd 26, 352(1)
+ stxvd2x 61, 1, 11
+ li 11, 208
+ stfd 27, 360(1)
+ stxvd2x 62, 1, 11
+ li 11, 224
+ xxspltw 62, 6, 1
+ stxvd2x 63, 1, 11
+ li 11, 16
+ stfd 28, 368(1)
+ lfiwzx 5, 5, 11
+ ld 5, 432(1)
+ stfd 29, 376(1)
+ stfd 30, 384(1)
+ stfd 31, 392(1)
+ xxspltw 61, 5, 1
+ beq 0, .LBB3_5
+ addis 30, 2, .LCPI3_0@toc@ha
+ neg 7, 7
+ xxleqv 34, 34, 34
+ addis 28, 2, .LCPI3_2@toc@ha
+ addis 27, 2, .LCPI3_3@toc@ha
+ addis 26, 2, .LCPI3_4@toc@ha
+ addis 25, 2, .LCPI3_5@toc@ha
+ ld 29, 24(3)
+ addi 0, 30, .LCPI3_0@toc@l
+ mtfprwz 1, 7
+ addis 7, 2, .LCPI3_1@toc@ha
+ ld 30, 16(3)
+ lxvd2x 0, 0, 0
+ mtfprwz 2, 6
+ rldicl 6, 6, 32, 32
+ addi 0, 7, .LCPI3_1@toc@l
+ ld 7, 8(3)
+ vslw 2, 2, 2
+ lvx 5, 0, 0
+ addi 0, 28, .LCPI3_2@toc@l
+ addi 28, 27, .LCPI3_3@toc@l
+ addi 27, 26, .LCPI3_4@toc@l
+ addi 26, 25, .LCPI3_5@toc@l
+ or 25, 9, 8
+ li 9, 0
+ xxspltw 36, 2, 1
+ xxswapd 35, 0
+ xxspltw 0, 1, 1
+ xxland 35, 0, 35
+ mtfprwz 0, 6
+ ld 6, 0(3)
+ addi 3, 3, -8
+ vadduwm 4, 3, 4
+ xxlor 35, 35, 34
+ xxlxor 34, 36, 34
+ xxlor 9, 36, 36
+ vspltisw 4, 4
+ vcmpgtsw 2, 3, 2
+ xxspltw 35, 0, 1
+ xxlor 10, 36, 36
+ vsubuwm 2, 3, 2
+ xxlor 11, 34, 34
+ lvx 2, 0, 0
+ li 0, 32
+ xxlor 12, 34, 34
+ lvx 2, 0, 28
+ li 28, 48
+ xxlor 13, 34, 34
+ lvx 2, 0, 27
+ li 27, 0
+ xxlor 31, 34, 34
+ lvx 2, 0, 26
+ xxlor 30, 34, 34
+.LBB3_2:
+ mr 26, 27
+ addi 27, 27, 1
+ xxlor 28, 40, 40
+ cmpld 27, 4
+ sldi 26, 26, 6
+ xxlor 24, 45, 45
+ iseleq 24, 10, 9
+ add 23, 6, 26
+ add 22, 30, 26
+ lxvd2x 0, 6, 26
+ lxvd2x 1, 7, 26
+ or 25, 24, 25
+ add 24, 7, 26
+ lxvd2x 2, 30, 26
+ lxvd2x 3, 29, 26
+ xxlor 29, 38, 38
+ lxvd2x 4, 23, 11
+ lxvd2x 6, 24, 11
+ clrlwi 25, 25, 24
+ lxvd2x 7, 22, 11
+ lxvd2x 8, 23, 0
+ mtfprd 5, 25
+ add 25, 29, 26
+ xxswapd 34, 0
+ lxvd2x 0, 25, 11
+ xxswapd 36, 1
+ xxswapd 33, 2
+ lxvd2x 1, 24, 0
+ lxvd2x 2, 22, 0
+ xxswapd 39, 3
+ xxswapd 32, 4
+ lxvd2x 3, 25, 0
+ lxvd2x 4, 23, 28
+ xxswapd 49, 6
+ xxswapd 51, 7
+ lxvd2x 6, 24, 28
+ xxswapd 58, 8
+ lxvd2x 7, 22, 28
+ lxvd2x 8, 25, 28
+ xxswapd 60, 0
+ mr 25, 3
+ xxswapd 57, 1
+ xxswapd 53, 2
+ xxswapd 52, 3
+ xxswapd 56, 4
+ xxswapd 55, 6
+ xxswapd 0, 5
+ xxswapd 40, 7
+ xxswapd 41, 8
+ mtctr 12
+.LBB3_3:
+ ldu 24, 8(25)
+ add 24, 24, 26
+ addi 24, 24, 256
+ dcbt 0, 24
+ bdnz .LBB3_3
+ vmrgew 3, 4, 2
+ vspltisw 31, 9
+ mr 25, 8
+ vmrglw 10, 4, 2
+ vspltisw 14, 10
+ vmrghw 6, 4, 2
+ xxspltw 0, 0, 3
+ vmrgew 4, 17, 0
+ vmrglw 11, 17, 0
+ vmrghw 16, 17, 0
+ vmrgew 0, 25, 26
+ vmrgew 13, 7, 1
+ vmrglw 2, 7, 1
+ vmrghw 7, 7, 1
+ xxlor 25, 36, 36
+ vmrgew 4, 28, 19
+ xxlor 26, 32, 32
+ vmrglw 0, 25, 26
+ vmrglw 1, 28, 19
+ xxmrgld 47, 34, 42
+ xxlor 44, 28, 28
+ vmrghw 25, 25, 26
+ xxlor 23, 36, 36
+ vmrghw 4, 28, 19
+ vspltisw 19, -16
+ xxlor 5, 32, 32
+ vmrgew 0, 20, 21
+ xxmrgld 34, 33, 43
+ vmrglw 28, 20, 21
+ vmrghw 21, 20, 21
+ vmrglw 20, 23, 24
+ vmrghw 26, 23, 24
+ vmrglw 17, 9, 8
+ xxlor 8, 32, 32
+ vmrgew 0, 23, 24
+ xxmrgld 56, 39, 38
+ vmrgew 23, 9, 8
+ xxlor 33, 24, 24
+ xxlor 2, 34, 34
+ vadduwm 11, 15, 1
+ xxmrgld 33, 36, 48
+ xxlor 6, 47, 47
+ xxlor 27, 32, 32
+ vmrghw 0, 9, 8
+ vspltisw 9, 12
+ vsubuwm 8, 31, 19
+ xxmrgld 51, 23, 25
+ vadduwm 31, 2, 12
+ xxlor 34, 10, 10
+ vadduwm 10, 14, 14
+ vslw 15, 2, 2
+ xxlor 34, 29, 29
+ vadduwm 14, 24, 27
+ xxlor 24, 48, 48
+ vadduwm 16, 1, 2
+ xxmrgld 34, 45, 35
+ vadduwm 31, 31, 30
+ xxmrghd 36, 36, 24
+ vadduwm 11, 11, 29
+ vadduwm 14, 14, 18
+ vadduwm 13, 16, 22
+ xxlxor 47, 63, 47
+ xxlor 1, 9, 9
+ xxlor 1, 11, 11
+ xxlxor 48, 43, 9
+ vadduwm 11, 11, 2
+ xxlor 7, 34, 34
+ xxmrghd 34, 39, 38
+ xxlxor 39, 46, 11
+ xxlor 1, 50, 50
+ xxlxor 50, 45, 0
+ vperm 15, 15, 15, 5
+ vperm 16, 16, 16, 5
+ vperm 7, 7, 7, 5
+ vperm 18, 18, 18, 5
+ xxlor 4, 33, 33
+ xxlor 33, 31, 31
+ vadduwm 14, 14, 2
+ xxlor 3, 34, 34
+ xxlor 34, 12, 12
+ xxlor 35, 13, 13
+ vadduwm 6, 15, 1
+ xxlor 33, 30, 30
+ vadduwm 2, 16, 2
+ vadduwm 3, 7, 3
+ vadduwm 12, 18, 1
+ xxlxor 59, 34, 61
+ xxlxor 61, 35, 1
+ xxlxor 33, 38, 62
+ xxlxor 62, 44, 54
+ vrlw 22, 27, 10
+ vrlw 29, 29, 10
+ vrlw 1, 1, 10
+ vrlw 30, 30, 10
+ vadduwm 31, 31, 19
+ vadduwm 13, 13, 4
+ vadduwm 11, 22, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 1, 31
+ vadduwm 13, 30, 13
+ vadduwm 9, 9, 9
+ xxlor 1, 36, 36
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 39
+ xxmrgld 39, 60, 5
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 28, 4, 9
+ xxmrgld 36, 53, 57
+ vrlw 15, 15, 9
+ xxmrghd 57, 53, 57
+ vrlw 18, 18, 9
+ vadduwm 14, 14, 4
+ xxlor 0, 36, 36
+ xxmrgld 36, 49, 52
+ vadduwm 2, 16, 2
+ xxmrgld 49, 8, 26
+ vadduwm 3, 28, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 54, 34, 54
+ xxlxor 61, 35, 61
+ xxlxor 33, 38, 33
+ xxlxor 62, 44, 62
+ vrlw 29, 29, 8
+ vrlw 20, 1, 8
+ xxmrgld 33, 55, 27
+ vrlw 30, 30, 8
+ vrlw 22, 22, 8
+ vadduwm 11, 11, 7
+ xxlor 5, 39, 39
+ xxmrgld 39, 32, 58
+ vadduwm 31, 31, 4
+ vadduwm 11, 29, 11
+ vadduwm 13, 13, 7
+ vadduwm 14, 20, 14
+ vadduwm 31, 30, 31
+ vadduwm 13, 22, 13
+ xxlor 28, 36, 36
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 60
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vadduwm 11, 11, 17
+ vmr 28, 17
+ xxmrghd 49, 32, 58
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 21, 4, 2
+ vadduwm 3, 15, 3
+ xxlxor 34, 38, 61
+ xxlxor 61, 44, 52
+ xxlxor 62, 53, 62
+ xxlxor 54, 35, 54
+ vrlw 20, 2, 10
+ vrlw 29, 29, 10
+ vrlw 0, 30, 10
+ vrlw 30, 22, 10
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 17
+ vadduwm 11, 20, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 24
+ xxlor 8, 56, 56
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 21
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 52
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 25, 51, 51
+ vmr 26, 17
+ xxlor 49, 3, 3
+ xxlor 52, 1, 1
+ xxlor 51, 2, 2
+ vadduwm 14, 14, 17
+ vadduwm 31, 31, 20
+ vadduwm 13, 13, 19
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 29, 39, 39
+ xxlor 59, 4, 4
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 53, 0, 0
+ xxlor 39, 6, 6
+ vadduwm 11, 11, 27
+ vadduwm 14, 14, 21
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 1
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 34, 7, 7
+ vadduwm 31, 31, 28
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 11, 11, 2
+ xxlor 34, 28, 28
+ vadduwm 13, 13, 26
+ vadduwm 14, 14, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 2, 58, 58
+ xxlor 39, 25, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 54, 29, 29
+ xxlor 58, 5, 5
+ vadduwm 11, 11, 25
+ vadduwm 14, 14, 7
+ vadduwm 31, 31, 22
+ vadduwm 13, 13, 26
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 21
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 20
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 0, 33, 33
+ xxlor 33, 8, 8
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vadduwm 11, 11, 19
+ vadduwm 14, 14, 2
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 22
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ vadduwm 11, 11, 27
+ vadduwm 14, 14, 28
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 31, 31, 25
+ vadduwm 13, 13, 26
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 3, 7, 7
+ vadduwm 11, 11, 7
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 33, 6, 6
+ xxlor 58, 2, 2
+ xxlor 39, 3, 3
+ vadduwm 14, 14, 1
+ vadduwm 31, 31, 26
+ vadduwm 13, 13, 7
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ xxlor 52, 0, 0
+ vadduwm 11, 11, 21
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 14, 14, 2
+ vadduwm 31, 31, 22
+ vadduwm 13, 13, 20
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 7, 49, 49
+ vmr 17, 2
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 54, 1, 1
+ xxlor 34, 7, 7
+ vadduwm 11, 11, 22
+ vadduwm 14, 14, 28
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 26
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 59, 25, 25
+ vadduwm 11, 11, 19
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 27
+ vadduwm 13, 13, 7
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vmr 2, 19
+ xxlor 0, 7, 7
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 1, 51, 51
+ xxlor 7, 39, 39
+ xxlor 51, 8, 8
+ xxlor 39, 5, 5
+ xxlor 34, 4, 4
+ vadduwm 11, 11, 1
+ vadduwm 14, 14, 19
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ xxlor 2, 53, 53
+ vmr 21, 28
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 53, 29, 29
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 28
+ vadduwm 31, 31, 26
+ vadduwm 13, 13, 21
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ vadduwm 11, 11, 20
+ xxlor 5, 52, 52
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 52, 2, 2
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 20
+ vadduwm 13, 13, 7
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ vadduwm 11, 11, 22
+ vadduwm 14, 14, 27
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 3, 29, 29
+ xxlor 4, 49, 49
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ vmr 17, 28
+ xxlor 2, 54, 54
+ xxlor 3, 34, 34
+ xxlor 34, 8, 8
+ xxlor 51, 0, 0
+ xxlor 60, 7, 7
+ xxlor 54, 1, 1
+ vadduwm 11, 11, 2
+ vadduwm 14, 14, 19
+ vadduwm 31, 31, 28
+ vadduwm 13, 13, 22
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 26
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 6, 39, 39
+ xxlor 39, 4, 4
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vadduwm 11, 11, 21
+ vadduwm 14, 14, 27
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 28
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 0, 49, 49
+ xxlor 49, 5, 5
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 1
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 22
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 34, 3, 3
+ xxlor 49, 2, 2
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ vadduwm 11, 11, 19
+ vadduwm 14, 14, 20
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 17
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 14, 14, 27
+ vadduwm 11, 11, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 27, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 57, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 59, 32
+ xxlor 39, 7, 7
+ vrlw 30, 30, 8
+ vrlw 25, 25, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 1, 58, 58
+ vmr 26, 19
+ vadduwm 19, 31, 7
+ xxlor 39, 6, 6
+ vadduwm 11, 30, 11
+ vadduwm 7, 13, 7
+ vadduwm 13, 25, 14
+ vadduwm 14, 29, 19
+ vadduwm 7, 0, 7
+ xxlxor 48, 43, 48
+ xxlxor 36, 45, 36
+ xxlxor 47, 46, 47
+ xxlxor 50, 39, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 51, 1, 1
+ vadduwm 13, 13, 1
+ vadduwm 11, 11, 19
+ vadduwm 19, 16, 27
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 63, 51, 62
+ xxlxor 62, 35, 57
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 31, 31, 10
+ vrlw 30, 30, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 33, 0, 0
+ vadduwm 7, 7, 2
+ vadduwm 14, 14, 1
+ vadduwm 11, 31, 11
+ vadduwm 13, 30, 13
+ vadduwm 14, 29, 14
+ vadduwm 7, 0, 7
+ xxlxor 48, 43, 48
+ xxlxor 36, 45, 36
+ xxlxor 47, 46, 47
+ xxlxor 50, 39, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 60, 8, 8
+ vadduwm 1, 11, 21
+ vadduwm 11, 13, 28
+ vadduwm 13, 16, 19
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 51, 45, 63
+ xxlxor 63, 35, 62
+ xxlxor 62, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 31, 31, 8
+ vrlw 30, 30, 8
+ vrlw 0, 0, 8
+ vrlw 19, 19, 8
+ vadduwm 14, 14, 26
+ vadduwm 7, 7, 17
+ vadduwm 1, 31, 1
+ vadduwm 11, 30, 11
+ vadduwm 14, 0, 14
+ vadduwm 7, 19, 7
+ xxlxor 50, 33, 50
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 39, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 34, 4, 4
+ vadduwm 14, 14, 22
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 13, 4, 13
+ vadduwm 3, 15, 3
+ xxlxor 49, 38, 63
+ xxlxor 63, 44, 62
+ xxlxor 32, 45, 32
+ xxlxor 51, 35, 51
+ vrlw 17, 17, 10
+ vrlw 31, 31, 10
+ vrlw 0, 0, 10
+ vrlw 10, 19, 10
+ vadduwm 11, 11, 2
+ xxlor 34, 5, 5
+ vadduwm 1, 1, 20
+ vadduwm 2, 7, 2
+ vadduwm 7, 31, 11
+ vadduwm 11, 0, 14
+ vadduwm 2, 10, 2
+ vadduwm 1, 17, 1
+ xxlxor 36, 43, 36
+ xxlxor 46, 34, 47
+ vrlw 4, 4, 9
+ vrlw 14, 14, 9
+ xxlxor 47, 33, 50
+ xxlxor 48, 39, 48
+ vrlw 15, 15, 9
+ vrlw 9, 16, 9
+ vadduwm 13, 4, 13
+ vadduwm 3, 14, 3
+ xxlxor 32, 45, 32
+ xxlxor 45, 45, 33
+ xxlxor 33, 35, 42
+ xxlxor 59, 35, 39
+ vadduwm 3, 15, 6
+ vadduwm 6, 9, 12
+ xxlxor 39, 35, 49
+ xxlxor 42, 38, 63
+ vrlw 1, 1, 8
+ vrlw 7, 7, 8
+ vrlw 10, 10, 8
+ vrlw 0, 0, 8
+ xxlxor 40, 35, 43
+ xxlxor 38, 38, 34
+ xxlxor 61, 33, 41
+ xxlxor 50, 39, 36
+ xxlxor 62, 42, 46
+ xxlxor 54, 32, 47
+ bne 0, .LBB3_2
+.LBB3_5:
+ vmrglw 2, 27, 13
+ li 3, 32
+ li 4, 48
+ vmrglw 4, 6, 8
+ vmrglw 0, 18, 29
+ vmrglw 1, 22, 30
+ vmrghw 3, 27, 13
+ vmrghw 5, 6, 8
+ vmrghw 6, 18, 29
+ vmrghw 7, 22, 30
+ xxmrgld 40, 36, 34
+ xxmrghd 34, 36, 34
+ xxmrgld 41, 33, 32
+ xxswapd 0, 40
+ xxmrgld 36, 37, 35
+ xxmrghd 35, 37, 35
+ xxmrghd 37, 33, 32
+ xxswapd 1, 41
+ xxmrgld 32, 39, 38
+ xxmrghd 33, 39, 38
+ xxswapd 2, 34
+ xxswapd 4, 36
+ xxswapd 3, 37
+ stxvd2x 0, 0, 5
+ xxswapd 5, 32
+ stxvd2x 1, 5, 11
+ xxswapd 0, 35
+ xxswapd 1, 33
+ stxvd2x 2, 5, 3
+ li 3, 64
+ stxvd2x 3, 5, 4
+ li 4, 80
+ stxvd2x 4, 5, 3
+ li 3, 96
+ stxvd2x 5, 5, 4
+ li 4, 112
+ stxvd2x 0, 5, 3
+ stxvd2x 1, 5, 4
+ li 3, 224
+ lxvd2x 63, 1, 3
+ li 3, 208
+ lfd 31, 392(1)
+ ld 30, 312(1)
+ ld 29, 304(1)
+ lxvd2x 62, 1, 3
+ li 3, 192
+ lfd 30, 384(1)
+ ld 28, 296(1)
+ ld 27, 288(1)
+ lxvd2x 61, 1, 3
+ li 3, 176
+ lfd 29, 376(1)
+ ld 26, 280(1)
+ ld 25, 272(1)
+ lxvd2x 60, 1, 3
+ li 3, 160
+ lfd 28, 368(1)
+ ld 24, 264(1)
+ ld 23, 256(1)
+ lxvd2x 59, 1, 3
+ li 3, 144
+ lfd 27, 360(1)
+ ld 22, 248(1)
+ lxvd2x 58, 1, 3
+ li 3, 128
+ lfd 26, 352(1)
+ lxvd2x 57, 1, 3
+ li 3, 112
+ lfd 25, 344(1)
+ lxvd2x 56, 1, 3
+ li 3, 96
+ lfd 24, 336(1)
+ lxvd2x 55, 1, 3
+ li 3, 80
+ lfd 23, 328(1)
+ lxvd2x 54, 1, 3
+ li 3, 64
+ lxvd2x 53, 1, 3
+ li 3, 48
+ lxvd2x 52, 1, 3
+ addi 1, 1, 400
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end3:
+ .size blake3_hash4_sse2, .Lfunc_end3-.Lfunc_begin3
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
new file mode 100644
index 000000000..a8b2627f1
--- /dev/null
+++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
@@ -0,0 +1,3064 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE4.1 -> POWER8 PPC64 Little Endian
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ .text
+ .abiversion 2
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .byte 31
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 30
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 29
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 28
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_1:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI0_2:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI0_3:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+.LCPI0_4:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_5:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI0_6:
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+.LCPI0_7:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_8:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_9:
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_10:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+.LCPI0_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_12:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_13:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_14:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .text
+ .globl zfs_blake3_compress_in_place_sse41
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41:
+.Lfunc_begin0:
+ .cfi_startproc
+.Lfunc_gep0:
+ addis 2, 12, .TOC.-.Lfunc_gep0@ha
+ addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0:
+ .localentry zfs_blake3_compress_in_place_sse41, .Lfunc_lep0-.Lfunc_gep0
+ li 8, -64
+ mtvsrd 34, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ vspltisw 13, -16
+ stxvd2x 60, 1, 8
+ li 8, -48
+ mtvsrd 35, 7
+ lfd 2, 16(4)
+ lfd 3, 24(4)
+ addis 7, 2, .LCPI0_0@toc@ha
+ stxvd2x 61, 1, 8
+ li 8, -32
+ mtvsrwz 36, 6
+ rldicl 6, 6, 32, 32
+ stxvd2x 62, 1, 8
+ li 8, -16
+ vmrghb 2, 3, 2
+ stxvd2x 63, 1, 8
+ mtvsrwz 35, 6
+ addi 6, 7, .LCPI0_0@toc@l
+ addis 7, 2, .LCPI0_2@toc@ha
+ lfd 1, 8(4)
+ xxmrghd 32, 3, 2
+ lvx 6, 0, 6
+ xxlxor 33, 33, 33
+ addis 6, 2, .LCPI0_1@toc@ha
+ addi 7, 7, .LCPI0_2@toc@l
+ vmrghw 3, 3, 4
+ addi 6, 6, .LCPI0_1@toc@l
+ vspltisw 14, 9
+ xxmrghd 37, 1, 0
+ lxvd2x 0, 0, 3
+ lxvd2x 1, 3, 5
+ vperm 2, 1, 2, 6
+ vpkudum 9, 0, 5
+ xxswapd 36, 0
+ xxswapd 38, 1
+ xxmrgld 34, 34, 35
+ lvx 3, 0, 7
+ addis 7, 2, .LCPI0_4@toc@ha
+ addi 7, 7, .LCPI0_4@toc@l
+ vadduwm 4, 9, 4
+ lvx 11, 0, 7
+ addis 7, 2, .LCPI0_6@toc@ha
+ addi 7, 7, .LCPI0_6@toc@l
+ vadduwm 7, 4, 6
+ lvx 4, 0, 6
+ addis 6, 2, .LCPI0_3@toc@ha
+ addi 6, 6, .LCPI0_3@toc@l
+ vperm 11, 0, 5, 11
+ lvx 0, 0, 7
+ li 7, 48
+ xxlxor 40, 39, 34
+ lvx 10, 0, 6
+ addis 6, 2, .LCPI0_5@toc@ha
+ lxvd2x 1, 4, 7
+ vcmpgtsb 2, 1, 4
+ addi 6, 6, .LCPI0_5@toc@l
+ vperm 4, 8, 8, 3
+ vspltisw 8, 10
+ xxlandc 44, 36, 34
+ vadduwm 4, 8, 8
+ vadduwm 8, 12, 10
+ xxlxor 37, 40, 38
+ vrlw 6, 5, 4
+ vadduwm 5, 7, 11
+ vadduwm 7, 6, 5
+ lvx 5, 0, 6
+ li 6, 32
+ lxvd2x 0, 4, 6
+ addis 4, 2, .LCPI0_7@toc@ha
+ addis 6, 2, .LCPI0_9@toc@ha
+ xxlxor 42, 39, 44
+ xxswapd 44, 1
+ addi 4, 4, .LCPI0_7@toc@l
+ addi 6, 6, .LCPI0_9@toc@l
+ vcmpgtsb 5, 1, 5
+ vperm 1, 10, 10, 0
+ xxswapd 42, 0
+ vpkudum 16, 12, 10
+ xxlandc 47, 33, 37
+ vsubuwm 1, 14, 13
+ lvx 14, 0, 4
+ addis 4, 2, .LCPI0_8@toc@ha
+ vadduwm 8, 15, 8
+ xxswapd 45, 47
+ addi 4, 4, .LCPI0_8@toc@l
+ vadduwm 7, 7, 16
+ xxsldwi 48, 48, 48, 1
+ xxlxor 38, 40, 38
+ xxsldwi 40, 40, 40, 3
+ xxsldwi 39, 39, 39, 1
+ vperm 14, 10, 12, 14
+ vrlw 6, 6, 1
+ vadduwm 7, 6, 7
+ xxlxor 45, 39, 45
+ vperm 13, 13, 13, 3
+ xxlandc 45, 45, 34
+ vadduwm 8, 13, 8
+ xxlxor 38, 40, 38
+ vrlw 10, 6, 4
+ vadduwm 6, 7, 14
+ vadduwm 7, 10, 6
+ xxlxor 38, 39, 45
+ vperm 12, 6, 6, 0
+ lvx 6, 0, 4
+ addis 4, 2, .LCPI0_10@toc@ha
+ addi 4, 4, .LCPI0_10@toc@l
+ vperm 13, 11, 9, 6
+ xxlandc 44, 44, 37
+ vadduwm 15, 12, 8
+ vadduwm 7, 7, 13
+ xxsldwi 45, 45, 45, 3
+ xxlxor 40, 47, 42
+ xxsldwi 47, 47, 47, 1
+ xxsldwi 39, 39, 39, 3
+ vrlw 10, 8, 1
+ xxswapd 40, 44
+ vadduwm 17, 10, 7
+ lvx 7, 0, 4
+ addis 4, 2, .LCPI0_11@toc@ha
+ addi 4, 4, .LCPI0_11@toc@l
+ xxlxor 44, 49, 40
+ lvx 8, 0, 6
+ vperm 18, 9, 9, 7
+ lvx 9, 0, 4
+ addis 4, 2, .LCPI0_12@toc@ha
+ vperm 12, 12, 12, 3
+ addi 4, 4, .LCPI0_12@toc@l
+ vperm 19, 14, 16, 8
+ xxlandc 63, 44, 34
+ vperm 12, 19, 18, 9
+ vadduwm 15, 31, 15
+ xxlxor 42, 47, 42
+ vrlw 18, 10, 4
+ vadduwm 10, 17, 12
+ vadduwm 17, 18, 10
+ xxlxor 42, 49, 63
+ xxmrgld 63, 43, 46
+ xxsldwi 49, 49, 49, 1
+ vmrghw 14, 14, 11
+ vperm 19, 10, 10, 0
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI0_13@toc@ha
+ addi 4, 4, .LCPI0_13@toc@l
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI0_14@toc@ha
+ vperm 31, 16, 31, 10
+ addi 4, 4, .LCPI0_14@toc@l
+ vperm 14, 14, 16, 11
+ xxlandc 51, 51, 37
+ vadduwm 15, 19, 15
+ xxswapd 51, 51
+ vadduwm 17, 17, 31
+ xxlxor 50, 47, 50
+ xxsldwi 47, 47, 47, 3
+ vperm 30, 14, 31, 8
+ vrlw 18, 18, 1
+ vadduwm 17, 18, 17
+ xxlxor 51, 49, 51
+ vadduwm 17, 17, 14
+ vperm 19, 19, 19, 3
+ xxlandc 51, 51, 34
+ vadduwm 15, 19, 15
+ xxlxor 48, 47, 50
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 51
+ vperm 19, 12, 13, 6
+ vperm 18, 18, 18, 0
+ vperm 13, 13, 13, 7
+ vadduwm 17, 17, 19
+ xxlandc 50, 50, 37
+ xxsldwi 49, 49, 49, 3
+ vperm 13, 30, 13, 9
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxmrgld 62, 44, 46
+ vmrghw 12, 14, 12
+ xxlxor 48, 47, 48
+ xxsldwi 47, 47, 47, 1
+ vrlw 16, 16, 1
+ vperm 30, 31, 30, 10
+ vperm 12, 12, 31, 11
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 13
+ vperm 18, 18, 18, 3
+ vperm 31, 12, 30, 8
+ xxlandc 50, 50, 34
+ vadduwm 15, 18, 15
+ xxlxor 48, 47, 48
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ xxsldwi 49, 49, 49, 1
+ vperm 18, 18, 18, 0
+ vadduwm 17, 17, 30
+ xxlandc 50, 50, 37
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxlxor 48, 47, 48
+ xxsldwi 46, 47, 47, 3
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 12
+ vperm 18, 18, 18, 3
+ xxlandc 47, 50, 34
+ xxsldwi 50, 51, 51, 3
+ vadduwm 14, 15, 14
+ vperm 19, 13, 18, 6
+ xxlxor 48, 46, 48
+ vperm 18, 18, 18, 7
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vadduwm 17, 17, 19
+ vperm 15, 15, 15, 0
+ xxsldwi 49, 49, 49, 3
+ xxlandc 47, 47, 37
+ vadduwm 14, 15, 14
+ xxswapd 47, 47
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 29, 15, 14
+ vperm 14, 31, 18, 9
+ xxmrgld 50, 45, 44
+ xxlxor 48, 61, 48
+ vmrghw 12, 12, 13
+ vrlw 16, 16, 4
+ vperm 18, 30, 18, 10
+ vadduwm 17, 17, 14
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ xxsldwi 49, 49, 49, 1
+ vperm 15, 15, 15, 0
+ vadduwm 17, 17, 18
+ xxlandc 47, 47, 37
+ vadduwm 31, 15, 29
+ xxswapd 47, 47
+ xxlxor 48, 63, 48
+ xxsldwi 45, 63, 63, 3
+ vperm 31, 12, 30, 11
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 13, 15, 13
+ xxlxor 44, 45, 48
+ vadduwm 16, 17, 31
+ xxsldwi 49, 51, 51, 3
+ vrlw 12, 12, 4
+ vperm 19, 14, 17, 6
+ vadduwm 16, 12, 16
+ xxlxor 47, 48, 47
+ vperm 15, 15, 15, 0
+ xxlandc 47, 47, 37
+ vadduwm 13, 15, 13
+ xxswapd 47, 47
+ xxlxor 44, 45, 44
+ xxsldwi 45, 45, 45, 1
+ vrlw 30, 12, 1
+ vadduwm 12, 16, 19
+ xxsldwi 44, 44, 44, 3
+ vadduwm 16, 30, 12
+ xxlxor 44, 48, 47
+ vperm 15, 17, 17, 7
+ vperm 12, 12, 12, 3
+ vperm 17, 31, 18, 8
+ xxlandc 61, 44, 34
+ vperm 12, 17, 15, 9
+ vadduwm 13, 29, 13
+ xxlxor 47, 45, 62
+ xxmrgld 62, 46, 63
+ vmrghw 14, 31, 14
+ vrlw 15, 15, 4
+ vadduwm 16, 16, 12
+ vperm 30, 18, 30, 10
+ vperm 14, 14, 18, 11
+ xxsldwi 50, 51, 51, 3
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 61
+ xxsldwi 48, 48, 48, 1
+ vperm 19, 12, 18, 6
+ vperm 17, 17, 17, 0
+ vadduwm 16, 16, 30
+ xxmrgld 60, 44, 46
+ vmrghw 12, 14, 12
+ vperm 28, 30, 28, 10
+ xxlandc 49, 49, 37
+ vadduwm 13, 17, 13
+ xxswapd 49, 49
+ vperm 12, 12, 30, 11
+ xxlxor 47, 45, 47
+ xxsldwi 45, 45, 45, 3
+ vrlw 15, 15, 1
+ vperm 8, 12, 28, 8
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vadduwm 16, 16, 14
+ vperm 17, 17, 17, 3
+ xxlandc 49, 49, 34
+ vadduwm 13, 17, 13
+ xxlxor 47, 45, 47
+ vrlw 15, 15, 4
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vperm 17, 17, 17, 0
+ xxlandc 49, 49, 37
+ vadduwm 31, 17, 13
+ xxlxor 45, 63, 47
+ vrlw 15, 13, 1
+ vadduwm 13, 16, 19
+ xxswapd 48, 49
+ xxsldwi 51, 51, 51, 3
+ xxsldwi 45, 45, 45, 3
+ vadduwm 17, 15, 13
+ xxlxor 45, 49, 48
+ lvx 16, 0, 4
+ vperm 29, 13, 13, 3
+ vperm 13, 18, 18, 7
+ xxsldwi 50, 63, 63, 1
+ vperm 16, 14, 30, 16
+ vperm 7, 19, 19, 7
+ xxlandc 63, 61, 34
+ vadduwm 18, 31, 18
+ vperm 29, 16, 13, 9
+ xxlxor 47, 50, 47
+ vperm 6, 16, 19, 6
+ vrlw 15, 15, 4
+ vperm 7, 8, 7, 9
+ vadduwm 17, 17, 29
+ xxmrgld 41, 61, 44
+ vadduwm 17, 15, 17
+ vperm 9, 28, 9, 10
+ xxlxor 63, 49, 63
+ xxsldwi 49, 49, 49, 1
+ vperm 31, 31, 31, 0
+ vadduwm 17, 17, 28
+ xxlandc 63, 63, 37
+ vadduwm 18, 31, 18
+ xxswapd 63, 63
+ xxlxor 47, 50, 47
+ xxsldwi 46, 50, 50, 3
+ vrlw 15, 15, 1
+ vadduwm 17, 15, 17
+ xxlxor 63, 49, 63
+ vadduwm 17, 17, 12
+ vperm 31, 31, 31, 3
+ xxlandc 50, 63, 34
+ vadduwm 14, 18, 14
+ xxlxor 47, 46, 47
+ vrlw 15, 15, 4
+ vadduwm 17, 15, 17
+ xxlxor 50, 49, 50
+ vadduwm 6, 17, 6
+ vperm 18, 18, 18, 0
+ xxsldwi 38, 38, 38, 3
+ xxlandc 50, 50, 37
+ vadduwm 14, 18, 14
+ xxswapd 48, 50
+ xxlxor 47, 46, 47
+ xxsldwi 46, 46, 46, 1
+ vrlw 15, 15, 1
+ vadduwm 6, 15, 6
+ xxlxor 48, 38, 48
+ vadduwm 6, 6, 7
+ vperm 16, 16, 16, 3
+ xxlandc 48, 48, 34
+ vadduwm 14, 16, 14
+ xxlxor 40, 46, 47
+ vrlw 8, 8, 4
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 48
+ xxsldwi 38, 38, 38, 1
+ vperm 7, 7, 7, 0
+ vadduwm 6, 6, 9
+ xxlandc 39, 39, 37
+ vadduwm 14, 7, 14
+ xxswapd 39, 39
+ xxlxor 40, 46, 40
+ xxsldwi 41, 46, 46, 3
+ vrlw 8, 8, 1
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 39
+ vperm 3, 7, 7, 3
+ vmrghw 7, 12, 13
+ xxlandc 34, 35, 34
+ vperm 7, 7, 28, 11
+ vadduwm 3, 2, 9
+ xxlxor 40, 35, 40
+ vrlw 4, 8, 4
+ vadduwm 6, 6, 7
+ vadduwm 6, 4, 6
+ xxlxor 34, 38, 34
+ xxsldwi 0, 38, 38, 3
+ vperm 2, 2, 2, 0
+ xxlandc 34, 34, 37
+ vadduwm 3, 2, 3
+ xxswapd 34, 34
+ xxlxor 36, 35, 36
+ xxsldwi 1, 35, 35, 1
+ vrlw 4, 4, 1
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 1, 36, 34
+ stxvd2x 0, 0, 3
+ xxswapd 1, 1
+ stxvd2x 1, 3, 5
+ li 3, -16
+ lxvd2x 63, 1, 3
+ li 3, -32
+ lxvd2x 62, 1, 3
+ li 3, -48
+ lxvd2x 61, 1, 3
+ li 3, -64
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+
+# ---------------------------------------------------------------------------
+# 16-byte read-only constant pool for zfs_blake3_compress_xof_sse41 (below).
+# These are compiler-emitted vector literals for the POWER VSX port of the
+# BLAKE3 compression function: vperm byte-selector masks (values 0-15 pick
+# bytes from the first source vector, 16-31 from the second) plus the BLAKE3
+# IV words. The masks implement word rotations/shuffles of the message
+# schedule — exact roles are compiler-chosen; do not hand-edit.
+# ---------------------------------------------------------------------------
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_0:
+ .byte 31
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 30
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 29
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 28
+ .byte 2
+ .byte 1
+ .byte 0
+# halfword swap inside each 32-bit word — presumably realizes the BLAKE3
+# rotate-by-16 step via vperm; TODO confirm against generator output
+.LCPI1_1:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_2:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+# First four BLAKE3 IV words:
+# 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+.LCPI1_3:
+ .long 1779033703
+ .long 3144134277
+ .long 1013904242
+ .long 2773480762
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_4:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+# byte rotate inside each 32-bit word — presumably realizes the BLAKE3
+# rotate-by-8 step via vperm; TODO confirm against generator output
+.LCPI1_5:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_6:
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_7:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_8:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_9:
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_10:
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_11:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_12:
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_13:
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI1_14:
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+# ---------------------------------------------------------------------------
+# zfs_blake3_compress_xof_sse41 — BLAKE3 compress-and-extend (XOF) for
+# ppc64le, fully unrolled and compiler-scheduled (VSX/AltiVec).
+#
+# ELFv2 arguments (assumed from the C prototype used by the other BLAKE3
+# ports — TODO confirm): r3 = cv (8 x u32 chaining value), r4 = 64-byte
+# block, r5 = block_len, r6 = 64-bit counter (split into lo/hi 32-bit
+# halves via rldicl below), r7 = flags, r8 = 64-byte output buffer.
+#
+# Unlike compress_in_place (above, which writes back into cv), the tail of
+# this routine also loads cv from r3 and XORs it into the state to produce
+# the second 32 output bytes, storing all 64 bytes through r8.
+# Do not hand-edit the instruction stream: ordering is significant.
+# ---------------------------------------------------------------------------
+ .text
+ .globl zfs_blake3_compress_xof_sse41
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41:
+.Lfunc_begin1:
+ .cfi_startproc
+.Lfunc_gep1:
+# Establish TOC pointer (r2) for the global entry point; local entry skips this.
+ addis 2, 12, .TOC.-.Lfunc_gep1@ha
+ addi 2, 2, .TOC.-.Lfunc_gep1@l
+.Lfunc_lep1:
+ .localentry zfs_blake3_compress_xof_sse41, .Lfunc_lep1-.Lfunc_gep1
+# Prologue interleaved with setup: saves nonvolatile vector registers
+# v60-v63 (VSX 60-63) below the stack pointer while loading the message
+# block (r4), block_len (r5), counter halves (r6) and flags (r7).
+ li 9, -64
+ mtvsrd 34, 5
+ li 5, 16
+ lfdx 0, 0, 4
+ vspltisw 13, -16
+ addis 11, 2, .LCPI1_9@toc@ha
+ stxvd2x 60, 1, 9
+ li 9, -48
+ mtvsrd 35, 7
+ lfd 1, 8(4)
+ lfd 2, 16(4)
+ addis 7, 2, .LCPI1_0@toc@ha
+ stxvd2x 61, 1, 9
+ li 9, -32
+ mtvsrwz 36, 6
+ rldicl 6, 6, 32, 32
+ stxvd2x 62, 1, 9
+ li 9, -16
+ vmrghb 2, 3, 2
+ stxvd2x 63, 1, 9
+ mtvsrwz 35, 6
+ addi 6, 7, .LCPI1_0@toc@l
+ addis 7, 2, .LCPI1_2@toc@ha
+ lfd 3, 24(4)
+ xxmrghd 37, 1, 0
+ lvx 6, 0, 6
+ xxlxor 33, 33, 33
+# Load the 32-byte chaining value from cv (r3).
+ lxvd2x 0, 0, 3
+ addis 6, 2, .LCPI1_1@toc@ha
+ addi 7, 7, .LCPI1_2@toc@l
+ vmrghw 3, 3, 4
+ lxvd2x 1, 3, 5
+ addi 6, 6, .LCPI1_1@toc@l
+ vspltisw 14, 9
+ xxmrghd 32, 3, 2
+ xxswapd 36, 0
+ vperm 2, 1, 2, 6
+ xxswapd 38, 1
+ vpkudum 9, 0, 5
+ xxmrgld 34, 34, 35
+ lvx 3, 0, 7
+ addis 7, 2, .LCPI1_4@toc@ha
+ addi 7, 7, .LCPI1_4@toc@l
+# Fully unrolled compression rounds: each G step is add (vadduwm),
+# xor (xxlxor), and rotate — via vrlw or a vperm/xxswapd byte shuffle for
+# the 16/8-bit rotations. The .LCPI1_* masks permute the message words
+# between rounds. Scheduling is compiler-chosen; treat as opaque.
+ vadduwm 4, 9, 4
+ lvx 11, 0, 7
+ addis 7, 2, .LCPI1_6@toc@ha
+ addi 7, 7, .LCPI1_6@toc@l
+ vadduwm 7, 4, 6
+ lvx 4, 0, 6
+ addis 6, 2, .LCPI1_3@toc@ha
+ addi 6, 6, .LCPI1_3@toc@l
+ vperm 11, 0, 5, 11
+ lvx 0, 0, 7
+ li 7, 32
+ xxlxor 40, 39, 34
+ lvx 10, 0, 6
+ addis 6, 2, .LCPI1_5@toc@ha
+ lxvd2x 0, 4, 7
+ vcmpgtsb 2, 1, 4
+ addi 6, 6, .LCPI1_5@toc@l
+ vperm 4, 8, 8, 3
+ vspltisw 8, 10
+ xxlandc 44, 36, 34
+ vadduwm 4, 8, 8
+ vadduwm 8, 12, 10
+ xxlxor 37, 40, 38
+ vrlw 6, 5, 4
+ vadduwm 5, 7, 11
+ vadduwm 7, 6, 5
+ lvx 5, 0, 6
+ li 6, 48
+ lxvd2x 1, 4, 6
+ addis 4, 2, .LCPI1_7@toc@ha
+ xxlxor 42, 39, 44
+ addi 4, 4, .LCPI1_7@toc@l
+ vcmpgtsb 5, 1, 5
+ vperm 1, 10, 10, 0
+ xxswapd 42, 0
+ xxswapd 44, 1
+ vpkudum 16, 12, 10
+ xxlandc 47, 33, 37
+ vsubuwm 1, 14, 13
+ lvx 14, 0, 4
+ addis 4, 2, .LCPI1_8@toc@ha
+ vadduwm 8, 15, 8
+ xxswapd 45, 47
+ addi 4, 4, .LCPI1_8@toc@l
+ xxlxor 38, 40, 38
+ xxsldwi 40, 40, 40, 3
+ vadduwm 7, 7, 16
+ xxsldwi 48, 48, 48, 1
+ vrlw 6, 6, 1
+ xxsldwi 39, 39, 39, 1
+ vperm 14, 10, 12, 14
+ vadduwm 7, 6, 7
+ xxlxor 45, 39, 45
+ vperm 13, 13, 13, 3
+ xxlandc 45, 45, 34
+ vadduwm 8, 13, 8
+ xxlxor 38, 40, 38
+ vrlw 10, 6, 4
+ vadduwm 6, 7, 14
+ vadduwm 7, 10, 6
+ xxlxor 38, 39, 45
+ vperm 12, 6, 6, 0
+ lvx 6, 0, 4
+ addis 4, 2, .LCPI1_10@toc@ha
+ addi 4, 4, .LCPI1_10@toc@l
+ vperm 13, 11, 9, 6
+ xxlandc 44, 44, 37
+ vadduwm 15, 12, 8
+ vadduwm 7, 7, 13
+ xxsldwi 45, 45, 45, 3
+ xxlxor 40, 47, 42
+ xxsldwi 47, 47, 47, 1
+ xxsldwi 39, 39, 39, 3
+ vrlw 10, 8, 1
+ xxswapd 40, 44
+ vadduwm 17, 10, 7
+ lvx 7, 0, 4
+ addi 4, 11, .LCPI1_9@toc@l
+ xxlxor 44, 49, 40
+ lvx 8, 0, 4
+ addis 4, 2, .LCPI1_11@toc@ha
+ vperm 18, 9, 9, 7
+ addi 4, 4, .LCPI1_11@toc@l
+ vperm 12, 12, 12, 3
+ lvx 9, 0, 4
+ addis 4, 2, .LCPI1_12@toc@ha
+ vperm 19, 14, 16, 8
+ addi 4, 4, .LCPI1_12@toc@l
+ xxlandc 63, 44, 34
+ vperm 12, 19, 18, 9
+ vadduwm 15, 31, 15
+ xxlxor 42, 47, 42
+ vrlw 18, 10, 4
+ vadduwm 10, 17, 12
+ vadduwm 17, 18, 10
+ xxlxor 42, 49, 63
+ xxmrgld 63, 43, 46
+ xxsldwi 49, 49, 49, 1
+ vmrghw 14, 14, 11
+ vperm 19, 10, 10, 0
+ lvx 10, 0, 4
+ addis 4, 2, .LCPI1_13@toc@ha
+ addi 4, 4, .LCPI1_13@toc@l
+ lvx 11, 0, 4
+ addis 4, 2, .LCPI1_14@toc@ha
+ vperm 31, 16, 31, 10
+ addi 4, 4, .LCPI1_14@toc@l
+ vperm 14, 14, 16, 11
+ xxlandc 51, 51, 37
+ vadduwm 15, 19, 15
+ xxswapd 51, 51
+ vadduwm 17, 17, 31
+ xxlxor 50, 47, 50
+ xxsldwi 47, 47, 47, 3
+ vperm 30, 14, 31, 8
+ vrlw 18, 18, 1
+ vadduwm 17, 18, 17
+ xxlxor 51, 49, 51
+ vadduwm 17, 17, 14
+ vperm 19, 19, 19, 3
+ xxlandc 51, 51, 34
+ vadduwm 15, 19, 15
+ xxlxor 48, 47, 50
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 51
+ vperm 19, 12, 13, 6
+ vperm 18, 18, 18, 0
+ vperm 13, 13, 13, 7
+ vadduwm 17, 17, 19
+ xxlandc 50, 50, 37
+ xxsldwi 49, 49, 49, 3
+ vperm 13, 30, 13, 9
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxmrgld 62, 44, 46
+ vmrghw 12, 14, 12
+ xxlxor 48, 47, 48
+ xxsldwi 47, 47, 47, 1
+ vrlw 16, 16, 1
+ vperm 30, 31, 30, 10
+ vperm 12, 12, 31, 11
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 13
+ vperm 18, 18, 18, 3
+ vperm 31, 12, 30, 8
+ xxlandc 50, 50, 34
+ vadduwm 15, 18, 15
+ xxlxor 48, 47, 48
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ xxsldwi 49, 49, 49, 1
+ vperm 18, 18, 18, 0
+ vadduwm 17, 17, 30
+ xxlandc 50, 50, 37
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxlxor 48, 47, 48
+ xxsldwi 46, 47, 47, 3
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 12
+ vperm 18, 18, 18, 3
+ xxlandc 47, 50, 34
+ xxsldwi 50, 51, 51, 3
+ vadduwm 14, 15, 14
+ vperm 19, 13, 18, 6
+ xxlxor 48, 46, 48
+ vperm 18, 18, 18, 7
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vadduwm 17, 17, 19
+ vperm 15, 15, 15, 0
+ xxsldwi 49, 49, 49, 3
+ xxlandc 47, 47, 37
+ vadduwm 14, 15, 14
+ xxswapd 47, 47
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 29, 15, 14
+ vperm 14, 31, 18, 9
+ xxmrgld 50, 45, 44
+ xxlxor 48, 61, 48
+ vmrghw 12, 12, 13
+ vrlw 16, 16, 4
+ vperm 18, 30, 18, 10
+ vadduwm 17, 17, 14
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ xxsldwi 49, 49, 49, 1
+ vperm 15, 15, 15, 0
+ vadduwm 17, 17, 18
+ xxlandc 47, 47, 37
+ vadduwm 31, 15, 29
+ xxswapd 47, 47
+ xxlxor 48, 63, 48
+ xxsldwi 45, 63, 63, 3
+ vperm 31, 12, 30, 11
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 13, 15, 13
+ xxlxor 44, 45, 48
+ vadduwm 16, 17, 31
+ xxsldwi 49, 51, 51, 3
+ vrlw 12, 12, 4
+ vperm 19, 14, 17, 6
+ vadduwm 16, 12, 16
+ xxlxor 47, 48, 47
+ vperm 15, 15, 15, 0
+ xxlandc 47, 47, 37
+ vadduwm 13, 15, 13
+ xxswapd 47, 47
+ xxlxor 44, 45, 44
+ xxsldwi 45, 45, 45, 1
+ vrlw 30, 12, 1
+ vadduwm 12, 16, 19
+ xxsldwi 44, 44, 44, 3
+ vadduwm 16, 30, 12
+ xxlxor 44, 48, 47
+ vperm 15, 17, 17, 7
+ vperm 12, 12, 12, 3
+ vperm 17, 31, 18, 8
+ xxlandc 61, 44, 34
+ vperm 12, 17, 15, 9
+ vadduwm 13, 29, 13
+ xxlxor 47, 45, 62
+ xxmrgld 62, 46, 63
+ vmrghw 14, 31, 14
+ vrlw 15, 15, 4
+ vadduwm 16, 16, 12
+ vperm 30, 18, 30, 10
+ vperm 14, 14, 18, 11
+ xxsldwi 50, 51, 51, 3
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 61
+ xxsldwi 48, 48, 48, 1
+ vperm 19, 12, 18, 6
+ vperm 17, 17, 17, 0
+ vadduwm 16, 16, 30
+ xxmrgld 60, 44, 46
+ vmrghw 12, 14, 12
+ vperm 28, 30, 28, 10
+ xxlandc 49, 49, 37
+ vadduwm 13, 17, 13
+ xxswapd 49, 49
+ vperm 12, 12, 30, 11
+ xxlxor 47, 45, 47
+ xxsldwi 45, 45, 45, 3
+ vrlw 15, 15, 1
+ vperm 8, 12, 28, 8
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vadduwm 16, 16, 14
+ vperm 17, 17, 17, 3
+ xxlandc 49, 49, 34
+ vadduwm 13, 17, 13
+ xxlxor 47, 45, 47
+ vrlw 15, 15, 4
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vperm 17, 17, 17, 0
+ xxlandc 49, 49, 37
+ vadduwm 31, 17, 13
+ xxlxor 45, 63, 47
+ vrlw 15, 13, 1
+ vadduwm 13, 16, 19
+ xxswapd 48, 49
+ xxsldwi 51, 51, 51, 3
+ xxsldwi 45, 45, 45, 3
+ vadduwm 17, 15, 13
+ xxlxor 45, 49, 48
+ lvx 16, 0, 4
+ vperm 29, 13, 13, 3
+ vperm 13, 18, 18, 7
+ xxsldwi 50, 63, 63, 1
+ vperm 16, 14, 30, 16
+ vperm 7, 19, 19, 7
+ xxlandc 63, 61, 34
+ vadduwm 18, 31, 18
+ vperm 29, 16, 13, 9
+ xxlxor 47, 50, 47
+ vperm 6, 16, 19, 6
+ vrlw 15, 15, 4
+ vperm 7, 8, 7, 9
+ vadduwm 17, 17, 29
+ xxmrgld 41, 61, 44
+ vadduwm 17, 15, 17
+ vperm 9, 28, 9, 10
+ xxlxor 63, 49, 63
+ xxsldwi 49, 49, 49, 1
+ vperm 31, 31, 31, 0
+ vadduwm 17, 17, 28
+ xxlandc 63, 63, 37
+ vadduwm 18, 31, 18
+ xxswapd 63, 63
+ xxlxor 47, 50, 47
+ xxsldwi 46, 50, 50, 3
+ vrlw 15, 15, 1
+ vadduwm 17, 15, 17
+ xxlxor 63, 49, 63
+ vadduwm 17, 17, 12
+ vperm 31, 31, 31, 3
+ xxlandc 50, 63, 34
+ vadduwm 14, 18, 14
+ xxlxor 47, 46, 47
+ vrlw 15, 15, 4
+ vadduwm 17, 15, 17
+ xxlxor 50, 49, 50
+ vadduwm 6, 17, 6
+ vperm 18, 18, 18, 0
+ xxsldwi 38, 38, 38, 3
+ xxlandc 50, 50, 37
+ vadduwm 14, 18, 14
+ xxswapd 48, 50
+ xxlxor 47, 46, 47
+ xxsldwi 46, 46, 46, 1
+ vrlw 15, 15, 1
+ vadduwm 6, 15, 6
+ xxlxor 48, 38, 48
+ vadduwm 6, 6, 7
+ vperm 16, 16, 16, 3
+ xxlandc 48, 48, 34
+ vadduwm 14, 16, 14
+ xxlxor 40, 46, 47
+ vrlw 8, 8, 4
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 48
+ xxsldwi 38, 38, 38, 1
+ vperm 7, 7, 7, 0
+ vadduwm 6, 6, 9
+ xxlandc 39, 39, 37
+ vadduwm 14, 7, 14
+ xxswapd 39, 39
+ xxlxor 40, 46, 40
+ xxsldwi 41, 46, 46, 3
+ vrlw 8, 8, 1
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 39
+ vperm 3, 7, 7, 3
+ vmrghw 7, 12, 13
+ xxlandc 34, 35, 34
+ vperm 7, 7, 28, 11
+ vadduwm 3, 2, 9
+ xxlxor 40, 35, 40
+ vrlw 4, 8, 4
+ vadduwm 6, 6, 7
+ vadduwm 6, 4, 6
+ xxlxor 34, 38, 34
+ xxsldwi 0, 38, 38, 3
+ vperm 2, 2, 2, 0
+ xxlandc 34, 34, 37
+ vadduwm 3, 2, 3
+ xxswapd 34, 34
+ xxlxor 36, 35, 36
+ xxsldwi 1, 35, 35, 1
+ vrlw 4, 4, 1
+ xxlxor 0, 1, 0
+ xxswapd 0, 0
+ xxlxor 2, 36, 34
+# Store the first 32 output bytes (feed-forward XOR of the state halves)
+# through the output pointer in r8.
+ stxvd2x 0, 0, 8
+ xxswapd 2, 2
+ stxvd2x 2, 8, 5
+# XOF tail: reload the original cv (r3) and XOR it with the upper state
+# half to form output bytes 32..63 (this is what distinguishes compress_xof
+# from compress_in_place).
+ lfdx 0, 0, 3
+ lfd 2, 8(3)
+ xxmrghd 35, 2, 0
+ xxlxor 0, 1, 35
+ xxswapd 0, 0
+ stxvd2x 0, 8, 7
+ lfd 0, 16(3)
+ lfd 1, 24(3)
+ li 3, -16
+ xxmrghd 35, 1, 0
+ xxlxor 0, 34, 35
+ xxswapd 0, 0
+ stxvd2x 0, 8, 6
+# Epilogue: restore nonvolatile vector registers v60-v63 and return.
+ lxvd2x 63, 1, 3
+ li 3, -32
+ lxvd2x 62, 1, 3
+ li 3, -48
+ lxvd2x 61, 1, 3
+ li 3, -64
+ lxvd2x 60, 1, 3
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-.Lfunc_begin1
+ .cfi_endproc
+
+# ---------------------------------------------------------------------------
+# zfs_blake3_hash_many_sse41 — hash many contiguous inputs.
+#
+# Strategy (visible in the control flow below): while at least 4 inputs
+# remain, dispatch batches of 4 to blake3_hash4_sse41 (.LBB2_2); any
+# remainder is hashed one input at a time by repeatedly calling
+# zfs_blake3_compress_in_place_sse41 on 64-byte blocks (.LBB2_5-.LBB2_10).
+#
+# Register roles after the prologue (assumed from the hash_many prototype
+# shared by the other BLAKE3 ports — TODO confirm): r25=inputs, r26=num
+# inputs, r27=blocks per input, r28=key, r29=counter, r30=flags,
+# r24=flags_start, r18=flags_end (stack arg), r19=out (stack arg);
+# cr-bit 8 caches the increment_counter flag (andi. 4, 8, 1).
+# ---------------------------------------------------------------------------
+ .globl zfs_blake3_hash_many_sse41
+ .p2align 2
+ .type zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41:
+.Lfunc_begin2:
+ .cfi_startproc
+.Lfunc_gep2:
+# Establish TOC pointer for the global entry point.
+ addis 2, 12, .TOC.-.Lfunc_gep2@ha
+ addi 2, 2, .TOC.-.Lfunc_gep2@l
+.Lfunc_lep2:
+ .localentry zfs_blake3_hash_many_sse41, .Lfunc_lep2-.Lfunc_gep2
+# Prologue: save LR, CR and nonvolatile GPRs r17-r30 in the 256-byte frame.
+ mfocrf 12, 32
+ mflr 0
+ std 0, 16(1)
+ stw 12, 8(1)
+ stdu 1, -256(1)
+ .cfi_def_cfa_offset 256
+ .cfi_offset lr, 16
+ .cfi_offset r17, -120
+ .cfi_offset r18, -112
+ .cfi_offset r19, -104
+ .cfi_offset r20, -96
+ .cfi_offset r21, -88
+ .cfi_offset r22, -80
+ .cfi_offset r23, -72
+ .cfi_offset r24, -64
+ .cfi_offset r25, -56
+ .cfi_offset r26, -48
+ .cfi_offset r27, -40
+ .cfi_offset r28, -32
+ .cfi_offset r29, -24
+ .cfi_offset r30, -16
+ .cfi_offset cr2, 8
+ std 26, 208(1)
+ mr 26, 4
+ cmpldi 1, 4, 4
+ andi. 4, 8, 1
+ std 18, 144(1)
+ std 19, 152(1)
+ crmove 8, 1
+ ld 19, 360(1)
+ lwz 18, 352(1)
+ std 24, 192(1)
+ std 25, 200(1)
+ std 27, 216(1)
+ std 28, 224(1)
+ mr 24, 10
+ mr 28, 6
+ mr 27, 5
+ mr 25, 3
+ std 29, 232(1)
+ std 30, 240(1)
+ mr 30, 9
+ mr 29, 7
+ std 17, 136(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+# Fewer than 4 inputs? Skip straight to the scalar path.
+ blt 1, .LBB2_3
+ li 3, 0
+ li 4, 1
+ clrldi 23, 30, 32
+# r22 = counter step per batch input (1 if increment_counter, else 0).
+ isel 22, 4, 3, 8
+ clrldi 21, 24, 32
+ clrldi 20, 18, 32
+# 4-wide batch loop: hash 4 inputs per iteration via blake3_hash4_sse41,
+# advancing inputs by 4 pointers, out by 128 bytes, counter by 4 (if
+# incrementing).
+.LBB2_2:
+ mr 3, 25
+ mr 4, 27
+ mr 5, 28
+ mr 6, 29
+ mr 7, 22
+ mr 8, 23
+ mr 9, 21
+ mr 10, 20
+ std 19, 32(1)
+ bl blake3_hash4_sse41
+ addi 26, 26, -4
+ addi 3, 29, 4
+ addi 25, 25, 32
+ addi 19, 19, 128
+ cmpldi 26, 3
+ isel 29, 3, 29, 8
+ bgt 0, .LBB2_2
+.LBB2_3:
+# Scalar remainder: any inputs left over after the 4-wide batches.
+ cmpldi 26, 0
+ beq 0, .LBB2_11
+ li 3, 0
+ li 4, 1
+ or 21, 24, 30
+ li 20, 16
+ addi 24, 1, 96
+ isel 22, 4, 3, 8
+# Per-input loop: copy the 32-byte key into the on-stack cv buffer at
+# 96(r1), then compress this input's blocks one at a time.
+.LBB2_5:
+ lxvd2x 0, 28, 20
+ ld 23, 0(25)
+ mr 17, 27
+ mr 3, 21
+ stxvd2x 0, 24, 20
+ lxvd2x 0, 0, 28
+ stxvd2x 0, 0, 24
+# Per-block loop: r17 counts remaining blocks for this input.
+.LBB2_6:
+ cmpldi 17, 1
+ beq 0, .LBB2_8
+ cmpldi 17, 0
+ bne 0, .LBB2_9
+ b .LBB2_10
+.LBB2_8:
+# Last block of this input: OR in the flags_end bits.
+ or 3, 3, 18
+.LBB2_9:
+# Compress one full 64-byte block in place (cv at 96(r1), block at r23).
+ clrldi 7, 3, 56
+ mr 3, 24
+ mr 4, 23
+ li 5, 64
+ mr 6, 29
+ bl zfs_blake3_compress_in_place_sse41
+ addi 23, 23, 64
+ addi 17, 17, -1
+ mr 3, 30
+ b .LBB2_6
+.LBB2_10:
+# Input done: copy the 32-byte cv to *out, advance out/inputs/counter.
+ lxvd2x 0, 24, 20
+ addi 26, 26, -1
+ add 29, 29, 22
+ addi 25, 25, 8
+ cmpldi 26, 0
+ stxvd2x 0, 19, 20
+ lxvd2x 0, 0, 24
+ stxvd2x 0, 0, 19
+ addi 19, 19, 32
+ bne 0, .LBB2_5
+.LBB2_11:
+# Epilogue: restore r17-r30, CR and LR, pop the frame, return.
+ ld 30, 240(1)
+ ld 29, 232(1)
+ ld 28, 224(1)
+ ld 27, 216(1)
+ ld 26, 208(1)
+ ld 25, 200(1)
+ ld 24, 192(1)
+ ld 23, 184(1)
+ ld 22, 176(1)
+ ld 21, 168(1)
+ ld 20, 160(1)
+ ld 19, 152(1)
+ ld 18, 144(1)
+ ld 17, 136(1)
+ addi 1, 1, 256
+ ld 0, 16(1)
+ lwz 12, 8(1)
+ mtocrf 32, 12
+ mtlr 0
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end2-.Lfunc_begin2
+ .cfi_endproc
+
+# ---------------------------------------------------------------------------
+# 16-byte constant pool for blake3_hash4_sse41 (4-way transposed hashing,
+# defined below). Includes the BLAKE3 IV words splatted across all four
+# lanes, per-lane counter offsets, and vperm shuffle masks.
+# ---------------------------------------------------------------------------
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+# As four little-endian 32-bit words this is {0, 1, 2, 3} — presumably the
+# per-lane counter offsets for the 4 parallel inputs; TODO confirm.
+.LCPI3_0:
+ .quad 4294967296
+ .quad 12884901890
+# halfword swap inside each 32-bit word (rotate-by-16 pattern via vperm)
+.LCPI3_1:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+# byte rotate inside each 32-bit word (rotate-by-8 pattern via vperm)
+.LCPI3_2:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI3_3:
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+# BLAKE3 IV word 0 (0x6A09E667) splatted across all four lanes
+.LCPI3_4:
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+# BLAKE3 IV word 1 (0xBB67AE85) splatted across all four lanes
+.LCPI3_5:
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+# BLAKE3 IV word 2 (0x3C6EF372) splatted across all four lanes
+.LCPI3_6:
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+# BLAKE3 IV word 3 (0xA54FF53A) splatted across all four lanes
+.LCPI3_7:
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+# vperm selector mask (compiler-generated shuffle constant)
+.LCPI3_8:
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .text
+ .p2align 2
+ .type blake3_hash4_sse41,@function
+blake3_hash4_sse41:
+.Lfunc_begin3:
+ .cfi_startproc
+.Lfunc_gep3:
+ addis 2, 12, .TOC.-.Lfunc_gep3@ha
+ addi 2, 2, .TOC.-.Lfunc_gep3@l
+.Lfunc_lep3:
+ .localentry blake3_hash4_sse41, .Lfunc_lep3-.Lfunc_gep3
+ stdu 1, -416(1)
+ .cfi_def_cfa_offset 416
+ .cfi_offset r22, -176
+ .cfi_offset r23, -168
+ .cfi_offset r24, -160
+ .cfi_offset r25, -152
+ .cfi_offset r26, -144
+ .cfi_offset r27, -136
+ .cfi_offset r28, -128
+ .cfi_offset r29, -120
+ .cfi_offset r30, -112
+ .cfi_offset f20, -96
+ .cfi_offset f21, -88
+ .cfi_offset f22, -80
+ .cfi_offset f23, -72
+ .cfi_offset f24, -64
+ .cfi_offset f25, -56
+ .cfi_offset f26, -48
+ .cfi_offset f27, -40
+ .cfi_offset f28, -32
+ .cfi_offset f29, -24
+ .cfi_offset f30, -16
+ .cfi_offset f31, -8
+ .cfi_offset v20, -368
+ .cfi_offset v21, -352
+ .cfi_offset v22, -336
+ .cfi_offset v23, -320
+ .cfi_offset v24, -304
+ .cfi_offset v25, -288
+ .cfi_offset v26, -272
+ .cfi_offset v27, -256
+ .cfi_offset v28, -240
+ .cfi_offset v29, -224
+ .cfi_offset v30, -208
+ .cfi_offset v31, -192
+ li 11, 48
+ li 0, 8
+ std 30, 304(1)
+ li 30, 12
+ li 12, 4
+ lfiwzx 0, 0, 5
+ stxvd2x 52, 1, 11
+ li 11, 64
+ lfiwzx 2, 5, 0
+ li 0, 20
+ lfiwzx 3, 5, 30
+ stxvd2x 53, 1, 11
+ li 11, 80
+ li 30, 24
+ lfiwzx 4, 5, 0
+ li 0, 28
+ stxvd2x 54, 1, 11
+ li 11, 96
+ lfiwzx 1, 5, 12
+ lfiwzx 6, 5, 30
+ xxspltw 47, 0, 1
+ cmpldi 4, 0
+ std 22, 240(1)
+ stxvd2x 55, 1, 11
+ li 11, 112
+ lfiwzx 7, 5, 0
+ xxspltw 40, 2, 1
+ std 23, 248(1)
+ xxspltw 39, 3, 1
+ std 24, 256(1)
+ std 25, 264(1)
+ xxspltw 51, 1, 1
+ xxspltw 43, 6, 1
+ std 26, 272(1)
+ xxspltw 41, 7, 1
+ std 27, 280(1)
+ std 28, 288(1)
+ std 29, 296(1)
+ stxvd2x 56, 1, 11
+ li 11, 128
+ stfd 20, 320(1)
+ stxvd2x 57, 1, 11
+ li 11, 144
+ stfd 21, 328(1)
+ stxvd2x 58, 1, 11
+ li 11, 160
+ stfd 22, 336(1)
+ stxvd2x 59, 1, 11
+ li 11, 176
+ stfd 23, 344(1)
+ stxvd2x 60, 1, 11
+ li 11, 192
+ stfd 24, 352(1)
+ stxvd2x 61, 1, 11
+ li 11, 208
+ stfd 25, 360(1)
+ stxvd2x 62, 1, 11
+ li 11, 224
+ stfd 26, 368(1)
+ stxvd2x 63, 1, 11
+ li 11, 16
+ xxspltw 63, 4, 1
+ lfiwzx 5, 5, 11
+ ld 5, 448(1)
+ stfd 27, 376(1)
+ stfd 28, 384(1)
+ stfd 29, 392(1)
+ stfd 30, 400(1)
+ stfd 31, 408(1)
+ xxspltw 50, 5, 1
+ beq 0, .LBB3_5
+ addis 30, 2, .LCPI3_0@toc@ha
+ neg 7, 7
+ xxleqv 34, 34, 34
+ addis 28, 2, .LCPI3_5@toc@ha
+ addis 27, 2, .LCPI3_6@toc@ha
+ addis 26, 2, .LCPI3_7@toc@ha
+ addis 29, 2, .LCPI3_4@toc@ha
+ addis 25, 2, .LCPI3_8@toc@ha
+ addi 0, 30, .LCPI3_0@toc@l
+ mtfprwz 2, 7
+ addis 7, 2, .LCPI3_1@toc@ha
+ addis 30, 2, .LCPI3_3@toc@ha
+ addi 24, 29, .LCPI3_4@toc@l
+ ld 29, 24(3)
+ lxvd2x 1, 0, 0
+ mtfprwz 0, 6
+ rldicl 6, 6, 32, 32
+ addi 0, 30, .LCPI3_3@toc@l
+ ld 30, 16(3)
+ xxspltw 2, 2, 1
+ vslw 2, 2, 2
+ xxspltw 37, 0, 1
+ mtfprwz 0, 6
+ addi 6, 7, .LCPI3_1@toc@l
+ addis 7, 2, .LCPI3_2@toc@ha
+ xxswapd 35, 1
+ xxlxor 36, 36, 36
+ xxspltw 33, 0, 1
+ xxland 35, 2, 35
+ vadduwm 0, 3, 5
+ lvx 5, 0, 6
+ addi 6, 7, .LCPI3_2@toc@l
+ ld 7, 8(3)
+ xxlor 35, 35, 34
+ xxlxor 34, 32, 34
+ xxlor 9, 32, 32
+ lvx 0, 0, 6
+ ld 6, 0(3)
+ addi 3, 3, -8
+ vcmpgtsw 2, 3, 2
+ lvx 3, 0, 0
+ addi 0, 28, .LCPI3_5@toc@l
+ addi 28, 27, .LCPI3_6@toc@l
+ addi 27, 26, .LCPI3_7@toc@l
+ addi 26, 25, .LCPI3_8@toc@l
+ or 25, 9, 8
+ li 9, 0
+ vcmpgtsb 5, 4, 5
+ vcmpgtsb 0, 4, 0
+ xxlor 11, 35, 35
+ lvx 3, 0, 24
+ xxlor 12, 35, 35
+ vsubuwm 2, 1, 2
+ xxlnor 10, 37, 37
+ xxlor 13, 34, 34
+ lvx 2, 0, 0
+ li 0, 32
+ xxlnor 31, 32, 32
+ xxlor 30, 34, 34
+ lvx 2, 0, 28
+ li 28, 48
+ xxlor 29, 34, 34
+ lvx 2, 0, 27
+ li 27, 0
+ xxlor 28, 34, 34
+ lvx 2, 0, 26
+ xxlor 27, 34, 34
+.LBB3_2:
+ mr 26, 27
+ addi 27, 27, 1
+ xxlor 23, 39, 39
+ cmpld 27, 4
+ sldi 26, 26, 6
+ xxlor 24, 40, 40
+ iseleq 24, 10, 9
+ add 23, 6, 26
+ add 22, 30, 26
+ lxvd2x 0, 6, 26
+ lxvd2x 1, 7, 26
+ or 25, 24, 25
+ add 24, 7, 26
+ lxvd2x 2, 30, 26
+ lxvd2x 3, 29, 26
+ xxlor 26, 47, 47
+ lxvd2x 4, 23, 11
+ lxvd2x 6, 24, 11
+ clrlwi 25, 25, 24
+ xxlor 25, 51, 51
+ lxvd2x 7, 22, 11
+ lxvd2x 8, 23, 0
+ mtfprd 5, 25
+ add 25, 29, 26
+ xxswapd 34, 0
+ lxvd2x 0, 25, 11
+ xxswapd 38, 1
+ xxswapd 32, 2
+ lxvd2x 1, 24, 0
+ lxvd2x 2, 22, 0
+ xxswapd 40, 3
+ xxswapd 39, 4
+ lxvd2x 3, 25, 0
+ lxvd2x 4, 23, 28
+ xxswapd 60, 6
+ xxswapd 47, 7
+ lxvd2x 6, 24, 28
+ xxswapd 57, 8
+ lxvd2x 7, 22, 28
+ lxvd2x 8, 25, 28
+ xxswapd 58, 0
+ mr 25, 3
+ xxswapd 53, 1
+ xxswapd 56, 2
+ xxswapd 52, 3
+ xxswapd 55, 4
+ xxswapd 54, 6
+ xxswapd 0, 5
+ xxswapd 42, 7
+ xxswapd 48, 8
+ mtctr 12
+.LBB3_3:
+ ldu 24, 8(25)
+ add 24, 24, 26
+ addi 24, 24, 256
+ dcbt 0, 24
+ bdnz .LBB3_3
+ vmrgew 4, 28, 7
+ vspltisw 14, 9
+ mr 25, 8
+ vmrgew 27, 6, 2
+ vspltisw 17, 4
+ vmrglw 12, 6, 2
+ vspltisw 19, 10
+ vmrghw 30, 6, 2
+ xxspltw 0, 0, 3
+ vmrglw 2, 8, 0
+ vmrghw 13, 8, 0
+ xxlor 7, 36, 36
+ vmrgew 4, 21, 25
+ vmrglw 29, 28, 7
+ vmrghw 1, 28, 7
+ vmrglw 28, 26, 15
+ xxmrgld 37, 34, 44
+ vmrgew 7, 26, 15
+ vmrghw 15, 26, 15
+ xxlor 21, 36, 36
+ vmrglw 4, 21, 25
+ vmrghw 21, 21, 25
+ vmrglw 25, 20, 24
+ xxmrgld 34, 60, 61
+ vmrghw 26, 20, 24
+ xxlor 38, 26, 26
+ vmrgew 3, 8, 0
+ xxlor 5, 36, 36
+ vmrgew 4, 20, 24
+ vspltisw 24, -16
+ vmrglw 20, 22, 23
+ xxmrgld 57, 57, 5
+ vmrglw 8, 16, 10
+ vmrghw 0, 16, 10
+ vadduwm 12, 19, 19
+ xxlor 8, 37, 37
+ xxlor 20, 36, 36
+ vmrgew 4, 22, 23
+ vmrghw 23, 22, 23
+ xxmrgld 40, 40, 52
+ vmrgew 22, 16, 10
+ vsubuwm 10, 14, 24
+ vslw 14, 17, 17
+ vadduwm 17, 5, 6
+ xxmrgld 37, 47, 33
+ xxlor 22, 36, 36
+ xxmrgld 36, 45, 62
+ xxlor 38, 25, 25
+ xxlor 2, 34, 34
+ vadduwm 19, 4, 6
+ xxmrgld 38, 39, 7
+ xxlor 3, 36, 36
+ xxmrghd 39, 47, 33
+ xxlor 36, 24, 24
+ xxmrgld 33, 58, 53
+ vadduwm 17, 17, 18
+ vadduwm 29, 2, 4
+ xxmrgld 36, 35, 59
+ xxlor 34, 23, 23
+ xxmrghd 35, 45, 62
+ xxlor 1, 9, 9
+ vadduwm 28, 5, 2
+ xxlor 1, 13, 13
+ vadduwm 19, 19, 31
+ vadduwm 24, 29, 11
+ vadduwm 28, 28, 9
+ xxlxor 61, 49, 9
+ xxlor 1, 41, 41
+ xxlor 41, 11, 11
+ xxlxor 34, 51, 13
+ vperm 29, 29, 29, 9
+ xxlxor 46, 56, 46
+ vperm 2, 2, 2, 9
+ xxlxor 59, 60, 0
+ vperm 14, 14, 14, 9
+ vperm 30, 27, 27, 9
+ vadduwm 19, 19, 3
+ xxlor 4, 35, 35
+ xxland 61, 61, 10
+ xxlor 35, 12, 12
+ xxland 34, 34, 10
+ vadduwm 27, 29, 3
+ xxlor 35, 30, 30
+ vadduwm 17, 17, 4
+ xxlor 26, 36, 36
+ xxland 46, 46, 10
+ vadduwm 3, 2, 3
+ xxlor 36, 29, 29
+ xxland 62, 62, 10
+ xxlxor 45, 59, 50
+ xxlxor 50, 35, 63
+ vadduwm 31, 14, 4
+ xxlor 36, 28, 28
+ xxlor 6, 37, 37
+ vadduwm 16, 30, 4
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 1
+ vrlw 4, 13, 12
+ vrlw 18, 18, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 15, 24, 6
+ vadduwm 28, 28, 7
+ vadduwm 17, 4, 17
+ vadduwm 19, 18, 19
+ vadduwm 15, 11, 15
+ vadduwm 28, 5, 28
+ xxlor 25, 38, 38
+ xxlxor 61, 49, 61
+ xxlxor 34, 51, 34
+ xxlxor 46, 47, 46
+ xxlxor 62, 60, 62
+ xxlor 38, 27, 27
+ vadduwm 19, 19, 1
+ vperm 29, 29, 29, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 14, 14, 6
+ vperm 30, 30, 30, 6
+ xxlor 5, 33, 33
+ vadduwm 17, 17, 25
+ xxland 61, 61, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ xxland 62, 62, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 2, 3
+ vadduwm 31, 24, 31
+ vadduwm 16, 30, 16
+ xxlxor 36, 59, 36
+ xxlxor 50, 35, 50
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 18, 10
+ xxmrgld 50, 32, 55
+ vrlw 11, 11, 10
+ xxmrghd 55, 32, 55
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 15, 15, 8
+ vadduwm 28, 28, 18
+ vadduwm 17, 1, 17
+ vadduwm 19, 11, 19
+ vadduwm 15, 5, 15
+ vadduwm 28, 4, 28
+ xxlor 7, 57, 57
+ xxlxor 62, 49, 62
+ xxlxor 61, 51, 61
+ xxlxor 57, 47, 34
+ xxlxor 34, 60, 56
+ vperm 24, 30, 30, 9
+ xxmrgld 62, 20, 21
+ vperm 29, 29, 29, 9
+ vperm 25, 25, 25, 9
+ vperm 2, 2, 2, 9
+ vmr 14, 8
+ xxmrghd 40, 58, 53
+ xxmrgld 58, 54, 22
+ vadduwm 17, 17, 30
+ xxland 56, 56, 10
+ vadduwm 21, 19, 8
+ xxland 61, 61, 10
+ xxland 51, 57, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vadduwm 0, 15, 26
+ vadduwm 15, 28, 23
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 21
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vmr 13, 8
+ xxlor 53, 3, 3
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 52, 4, 4
+ xxlor 40, 2, 2
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 8
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ vmr 25, 26
+ xxlor 3, 39, 39
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 54, 6, 6
+ xxlor 58, 5, 5
+ xxlor 39, 8, 8
+ vadduwm 17, 17, 22
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 39, 26, 26
+ vadduwm 28, 28, 14
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 7
+ vadduwm 0, 0, 30
+ vadduwm 15, 15, 23
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 24, 55, 55
+ vadduwm 17, 17, 13
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vmr 23, 13
+ xxlor 45, 25, 25
+ xxlor 39, 7, 7
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 7
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 2, 46, 46
+ xxlor 46, 3, 3
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vadduwm 17, 17, 20
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 25
+ vadduwm 15, 15, 14
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 52, 2, 2
+ vadduwm 17, 17, 8
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 21
+ vadduwm 15, 15, 18
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ vadduwm 17, 17, 22
+ vadduwm 28, 28, 30
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 0, 0, 23
+ vadduwm 15, 15, 7
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 5, 4, 4
+ xxlor 4, 58, 58
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 39, 8, 8
+ xxlor 54, 24, 24
+ xxlor 58, 26, 26
+ vadduwm 17, 17, 13
+ vadduwm 28, 28, 7
+ vadduwm 0, 0, 22
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 3, 53, 53
+ xxlor 53, 4, 4
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 2, 55, 55
+ vmr 23, 18
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 50, 5, 5
+ vadduwm 17, 17, 14
+ vadduwm 28, 28, 30
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 22
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 25, 40, 40
+ vmr 8, 13
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ xxlor 45, 25, 25
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 13
+ xxlor 45, 2, 2
+ vadduwm 0, 0, 8
+ vadduwm 28, 28, 13
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 4, 57, 57
+ xxlor 26, 46, 46
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 8, 62, 62
+ xxlor 57, 3, 3
+ xxlor 46, 7, 7
+ xxlor 62, 6, 6
+ vadduwm 17, 17, 7
+ vadduwm 28, 28, 25
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 30
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vadduwm 17, 17, 20
+ xxlor 3, 52, 52
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 52, 8, 8
+ vadduwm 0, 0, 22
+ vadduwm 28, 28, 20
+ vadduwm 15, 15, 23
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 6, 55, 55
+ xxlor 55, 4, 4
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 17, 17, 23
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 21
+ vadduwm 15, 15, 14
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 4, 53, 53
+ xxlor 53, 26, 26
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 8
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 30
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 5, 25, 25
+ xxlor 2, 58, 58
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vmr 22, 26
+ vadduwm 0, 0, 26
+ xxlor 58, 5, 5
+ vadduwm 17, 17, 25
+ vadduwm 28, 28, 18
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 7, 24, 24
+ xxlor 8, 57, 57
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 57, 7, 7
+ vadduwm 17, 17, 20
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 5, 52, 52
+ xxlor 23, 45, 45
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 52, 6, 6
+ vadduwm 28, 28, 8
+ vmr 13, 8
+ xxlor 40, 3, 3
+ vadduwm 17, 17, 20
+ vadduwm 0, 0, 8
+ vadduwm 15, 15, 22
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 25, 39, 39
+ vmr 7, 30
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vmr 30, 18
+ xxlor 24, 46, 46
+ xxlor 46, 25, 25
+ xxlor 50, 8, 8
+ vadduwm 17, 17, 23
+ vadduwm 28, 28, 14
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 6, 58, 58
+ xxlor 58, 4, 4
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vadduwm 17, 17, 30
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 21
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 40, 23, 23
+ vadduwm 13, 28, 13
+ vadduwm 8, 17, 8
+ xxland 49, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 17, 31
+ vadduwm 16, 29, 16
+ vadduwm 28, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 60, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 2, 55, 55
+ vmr 23, 30
+ xxlor 62, 24, 24
+ vadduwm 0, 0, 22
+ vadduwm 15, 15, 30
+ vadduwm 8, 4, 8
+ vadduwm 13, 1, 13
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 40, 61
+ xxlxor 51, 45, 51
+ xxlxor 34, 32, 34
+ xxlxor 49, 47, 49
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 17, 17, 17, 9
+ vadduwm 13, 13, 14
+ xxlor 46, 5, 5
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 49, 49, 10
+ vadduwm 28, 29, 28
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 17, 16
+ xxlxor 36, 60, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 8, 8, 25
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 7
+ vadduwm 8, 4, 8
+ vadduwm 13, 1, 13
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 62, 40, 61
+ xxlxor 51, 45, 51
+ xxlxor 34, 32, 34
+ xxlxor 49, 47, 49
+ vperm 30, 30, 30, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 17, 17, 17, 6
+ vadduwm 29, 8, 20
+ vadduwm 8, 13, 18
+ xxland 45, 62, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 49, 49, 31
+ vadduwm 30, 13, 28
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 17, 16
+ xxlxor 36, 62, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 0, 0, 23
+ vadduwm 7, 15, 21
+ vadduwm 29, 1, 29
+ vadduwm 8, 11, 8
+ vadduwm 0, 5, 0
+ vadduwm 7, 4, 7
+ xxlxor 47, 61, 49
+ xxlxor 45, 40, 45
+ xxlxor 49, 32, 51
+ xxlxor 34, 39, 34
+ vperm 15, 15, 15, 9
+ vperm 13, 13, 13, 9
+ vperm 17, 17, 17, 9
+ vperm 2, 2, 2, 9
+ xxlor 46, 3, 3
+ vadduwm 9, 29, 26
+ vadduwm 8, 8, 14
+ xxland 46, 47, 10
+ xxland 45, 45, 10
+ xxland 47, 49, 10
+ xxland 34, 34, 10
+ vadduwm 17, 14, 31
+ vadduwm 16, 13, 16
+ vadduwm 18, 15, 30
+ vadduwm 3, 2, 3
+ xxlxor 33, 49, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 50, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 44, 6, 6
+ xxlor 0, 10, 10
+ vadduwm 0, 0, 12
+ xxlor 44, 2, 2
+ vadduwm 9, 1, 9
+ vadduwm 7, 7, 12
+ vadduwm 8, 11, 8
+ vadduwm 7, 4, 7
+ vadduwm 0, 5, 0
+ xxlxor 34, 39, 34
+ xxlxor 44, 32, 47
+ vperm 2, 2, 2, 6
+ xxlxor 46, 41, 46
+ xxlxor 45, 40, 45
+ vperm 12, 12, 12, 6
+ vperm 14, 14, 14, 6
+ vperm 13, 13, 13, 6
+ xxland 34, 34, 31
+ xxlor 1, 31, 31
+ vadduwm 3, 2, 3
+ xxland 44, 44, 31
+ xxlxor 36, 35, 36
+ xxlxor 51, 35, 40
+ xxland 35, 46, 31
+ xxland 38, 45, 31
+ vadduwm 15, 12, 18
+ vadduwm 8, 3, 17
+ vadduwm 13, 6, 16
+ xxlxor 37, 47, 37
+ xxlxor 33, 40, 33
+ xxlxor 43, 45, 43
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlxor 47, 47, 41
+ xxlxor 40, 40, 32
+ xxlxor 39, 45, 39
+ xxlxor 50, 36, 38
+ xxlxor 63, 33, 44
+ xxlxor 43, 43, 34
+ xxlxor 41, 37, 35
+ bne 0, .LBB3_2
+.LBB3_5:
+ vmrglw 2, 19, 15
+ li 3, 32
+ li 4, 48
+ vmrglw 4, 7, 8
+ vmrglw 0, 31, 18
+ vmrglw 1, 9, 11
+ vmrghw 3, 19, 15
+ vmrghw 5, 7, 8
+ vmrghw 6, 31, 18
+ vmrghw 7, 9, 11
+ xxmrgld 40, 36, 34
+ xxmrghd 34, 36, 34
+ xxmrgld 41, 33, 32
+ xxswapd 0, 40
+ xxmrgld 36, 37, 35
+ xxmrghd 35, 37, 35
+ xxmrghd 37, 33, 32
+ xxswapd 1, 41
+ xxmrgld 32, 39, 38
+ xxmrghd 33, 39, 38
+ xxswapd 2, 34
+ xxswapd 4, 36
+ xxswapd 3, 37
+ stxvd2x 0, 0, 5
+ xxswapd 5, 32
+ stxvd2x 1, 5, 11
+ xxswapd 0, 35
+ xxswapd 1, 33
+ stxvd2x 2, 5, 3
+ li 3, 64
+ stxvd2x 3, 5, 4
+ li 4, 80
+ stxvd2x 4, 5, 3
+ li 3, 96
+ stxvd2x 5, 5, 4
+ li 4, 112
+ stxvd2x 0, 5, 3
+ stxvd2x 1, 5, 4
+ li 3, 224
+ lxvd2x 63, 1, 3
+ li 3, 208
+ lfd 31, 408(1)
+ ld 30, 304(1)
+ ld 29, 296(1)
+ lxvd2x 62, 1, 3
+ li 3, 192
+ lfd 30, 400(1)
+ ld 28, 288(1)
+ ld 27, 280(1)
+ lxvd2x 61, 1, 3
+ li 3, 176
+ lfd 29, 392(1)
+ ld 26, 272(1)
+ ld 25, 264(1)
+ lxvd2x 60, 1, 3
+ li 3, 160
+ lfd 28, 384(1)
+ ld 24, 256(1)
+ ld 23, 248(1)
+ lxvd2x 59, 1, 3
+ li 3, 144
+ lfd 27, 376(1)
+ ld 22, 240(1)
+ lxvd2x 58, 1, 3
+ li 3, 128
+ lfd 26, 368(1)
+ lxvd2x 57, 1, 3
+ li 3, 112
+ lfd 25, 360(1)
+ lxvd2x 56, 1, 3
+ li 3, 96
+ lfd 24, 352(1)
+ lxvd2x 55, 1, 3
+ li 3, 80
+ lfd 23, 344(1)
+ lxvd2x 54, 1, 3
+ li 3, 64
+ lfd 22, 336(1)
+ lxvd2x 53, 1, 3
+ li 3, 48
+ lfd 21, 328(1)
+ lxvd2x 52, 1, 3
+ lfd 20, 320(1)
+ addi 1, 1, 416
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end3:
+ .size blake3_hash4_sse41, .Lfunc_end3-.Lfunc_begin3
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S
new file mode 100644
index 000000000..b15d8fc77
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S
@@ -0,0 +1,1845 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#if defined(HAVE_AVX2)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_hash_many_avx2
+.text
+
+.type zfs_blake3_hash_many_avx2,@function
+.p2align 6
+zfs_blake3_hash_many_avx2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 680
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ vmovd xmm0, r9d
+ vpbroadcastd ymm0, xmm0
+ vmovdqa ymmword ptr [rsp+0x280], ymm0
+ vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
+ vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
+ vmovdqa ymmword ptr [rsp+0x220], ymm2
+ vmovd xmm2, r8d
+ vpbroadcastd ymm2, xmm2
+ vpaddd ymm2, ymm2, ymm1
+ vmovdqa ymmword ptr [rsp+0x240], ymm2
+ vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm1, ymm2
+ shr r8, 32
+ vmovd xmm3, r8d
+ vpbroadcastd ymm3, xmm3
+ vpsubd ymm3, ymm3, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm3
+ shl rdx, 6
+ mov qword ptr [rsp+0x2A0], rdx
+ cmp rsi, 8
+ jc 3f
+2:
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x2A0]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x200], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x20], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x40], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x60], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x80], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0xA0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0xC0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0xE0], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x100], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x120], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x140], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x160], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x180], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x1A0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x1C0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x1E0], ymm11
+ vpbroadcastd ymm15, dword ptr [rsp+0x200]
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm0, ymmword ptr [rsp+0x240]
+ vpxor ymm13, ymm1, ymmword ptr [rsp+0x260]
+ vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpxor ymm15, ymm3, ymm15
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
+ vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
+ vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
+ vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp+0x220]
+ vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240]
+ vmovdqa ymmword ptr [rsp+0x240], ymm1
+ vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm0, ymm2
+ vmovdqa ymm0, ymmword ptr [rsp+0x260]
+ vpsubd ymm2, ymm0, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm2
+ add rdi, 64
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 8
+ cmp rsi, 8
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x2A0]
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ test rsi, 0x4
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovdqa ymm8, ymm0
+ vmovdqa ymm9, ymm1
+ vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
+ vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
+ vpunpckldq ymm14, ymm12, ymm13
+ vpunpckhdq ymm15, ymm12, ymm13
+ vpermq ymm14, ymm14, 0x50
+ vpermq ymm15, ymm15, 0x50
+ vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpblendd ymm14, ymm14, ymm12, 0x44
+ vpblendd ymm15, ymm15, ymm12, 0x44
+ vmovdqa ymmword ptr [rsp], ymm14
+ vmovdqa ymmword ptr [rsp+0x20], ymm15
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vmovups ymm2, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm2, ymm3, 136
+ vshufps ymm5, ymm2, ymm3, 221
+ vmovups ymm2, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm2, ymm3, 136
+ vshufps ymm7, ymm2, ymm3, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ vmovups ymm10, ymmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
+ vshufps ymm12, ymm10, ymm11, 136
+ vshufps ymm13, ymm10, ymm11, 221
+ vmovups ymm10, ymmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
+ vshufps ymm14, ymm10, ymm11, 136
+ vshufps ymm15, ymm10, ymm11, 221
+ vpshufd ymm14, ymm14, 0x93
+ vpshufd ymm15, ymm15, 0x93
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ vpbroadcastd ymm2, dword ptr [rsp+0x200]
+ vmovdqa ymm3, ymmword ptr [rsp]
+ vmovdqa ymm11, ymmword ptr [rsp+0x20]
+ vpblendd ymm3, ymm3, ymm2, 0x88
+ vpblendd ymm11, ymm11, ymm2, 0x88
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa ymm10, ymm2
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm8, ymm8, ymm12
+ vmovdqa ymmword ptr [rsp+0x40], ymm4
+ nop
+ vmovdqa ymmword ptr [rsp+0x60], ymm12
+ nop
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vmovdqa ymmword ptr [rsp+0x80], ymm5
+ vmovdqa ymmword ptr [rsp+0xA0], ymm13
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm8, ymm8, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpshufd ymm10, ymm10, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm8, ymm8, ymm15
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm8, ymm8, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ vpshufd ymm10, ymm10, 0x93
+ dec al
+ je 9f
+ vmovdqa ymm4, ymmword ptr [rsp+0x40]
+ vmovdqa ymm5, ymmword ptr [rsp+0x80]
+ vshufps ymm12, ymm4, ymm5, 214
+ vpshufd ymm13, ymm4, 0x0F
+ vpshufd ymm4, ymm12, 0x39
+ vshufps ymm12, ymm6, ymm7, 250
+ vpblendd ymm13, ymm13, ymm12, 0xAA
+ vpunpcklqdq ymm12, ymm7, ymm5
+ vpblendd ymm12, ymm12, ymm6, 0x88
+ vpshufd ymm12, ymm12, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymmword ptr [rsp+0x40], ymm13
+ vmovdqa ymmword ptr [rsp+0x80], ymm12
+ vmovdqa ymm12, ymmword ptr [rsp+0x60]
+ vmovdqa ymm13, ymmword ptr [rsp+0xA0]
+ vshufps ymm5, ymm12, ymm13, 214
+ vpshufd ymm6, ymm12, 0x0F
+ vpshufd ymm12, ymm5, 0x39
+ vshufps ymm5, ymm14, ymm15, 250
+ vpblendd ymm6, ymm6, ymm5, 0xAA
+ vpunpcklqdq ymm5, ymm15, ymm13
+ vpblendd ymm5, ymm5, ymm14, 0x88
+ vpshufd ymm5, ymm5, 0x78
+ vpunpckhdq ymm13, ymm13, ymm15
+ vpunpckldq ymm14, ymm14, ymm13
+ vpshufd ymm15, ymm14, 0x1E
+ vmovdqa ymm13, ymm6
+ vmovdqa ymm14, ymm5
+ vmovdqa ymm5, ymmword ptr [rsp+0x40]
+ vmovdqa ymm6, ymmword ptr [rsp+0x80]
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ vpxor ymm8, ymm8, ymm10
+ vpxor ymm9, ymm9, ymm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqu xmmword ptr [rbx+0x40], xmm8
+ vmovdqu xmmword ptr [rbx+0x50], xmm9
+ vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
+ vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
+ vmovaps xmm8, xmmword ptr [rsp+0x280]
+ vmovaps xmm0, xmmword ptr [rsp+0x240]
+ vmovaps xmm1, xmmword ptr [rsp+0x250]
+ vmovaps xmm2, xmmword ptr [rsp+0x260]
+ vmovaps xmm3, xmmword ptr [rsp+0x270]
+ vblendvps xmm0, xmm0, xmm1, xmm8
+ vblendvps xmm2, xmm2, xmm3, xmm8
+ vmovaps xmmword ptr [rsp+0x240], xmm0
+ vmovaps xmmword ptr [rsp+0x260], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test rsi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp+0x240]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x244]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
+ vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x200]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovaps ymm8, ymmword ptr [rsp+0x280]
+ vmovaps ymm0, ymmword ptr [rsp+0x240]
+ vmovups ymm1, ymmword ptr [rsp+0x248]
+ vmovaps ymm2, ymmword ptr [rsp+0x260]
+ vmovups ymm3, ymmword ptr [rsp+0x268]
+ vblendvps ymm0, ymm0, ymm1, ymm8
+ vblendvps ymm2, ymm2, ymm3, ymm8
+ vmovaps ymmword ptr [rsp+0x240], ymm0
+ vmovaps ymmword ptr [rsp+0x260], ymm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test rsi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm3, dword ptr [rsp+0x240]
+ vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm14, xmmword ptr [ROT16+rip]
+ vmovdqa xmm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa xmm3, xmm13
+ vpinsrd xmm3, xmm3, eax, 3
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.size zfs_blake3_hash_many_avx2, . - zfs_blake3_hash_many_avx2
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+
+.p2align 6
+ADD0:
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ADD1:
+ .long 8, 8, 8, 8, 8, 8, 8, 8
+BLAKE3_IV_0:
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ROT16:
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+CMP_MSB_MASK:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+BLAKE3_IV:
+ .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+#endif /* HAVE_AVX2 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_avx512.S b/module/icp/asm-x86_64/blake3/blake3_avx512.S
new file mode 100644
index 000000000..d02c5e7ec
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_avx512.S
@@ -0,0 +1,2618 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#if defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_hash_many_avx512
+.global zfs_blake3_compress_in_place_avx512
+.global zfs_blake3_compress_xof_avx512
+.text
+
+.type zfs_blake3_hash_many_avx512,@function
+.type zfs_blake3_compress_xof_avx512,@function
+.type zfs_blake3_compress_in_place_avx512,@function
+
+.p2align 6
+zfs_blake3_hash_many_avx512:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 144
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9
+ kmovw k1, r9d
+ vmovd xmm0, r8d
+ vpbroadcastd ymm0, xmm0
+ shr r8, 32
+ vmovd xmm1, r8d
+ vpbroadcastd ymm1, xmm1
+ vmovdqa ymm4, ymm1
+ vmovdqa ymm5, ymm1
+ vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
+ vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
+ vpcmpltud k2, ymm2, ymm0
+ vpcmpltud k3, ymm3, ymm0
+ vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
+ vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
+ knotw k2, k1
+ vmovdqa32 ymm2 {k2}, ymm0
+ vmovdqa32 ymm3 {k2}, ymm0
+ vmovdqa32 ymm4 {k2}, ymm1
+ vmovdqa32 ymm5 {k2}, ymm1
+ vmovdqa ymmword ptr [rsp], ymm2
+ vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
+ vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
+ shl rdx, 6
+ mov qword ptr [rsp+0x80], rdx
+ cmp rsi, 16
+ jc 3f
+2:
+ vpbroadcastd zmm0, dword ptr [rcx]
+ vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
+ vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
+ vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
+ vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
+ vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
+ vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
+ vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm8, zmm16, zmm17
+ vpunpckhqdq zmm9, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm10, zmm18, zmm19
+ vpunpckhqdq zmm11, zmm18, zmm19
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm12, zmm16, zmm17
+ vpunpckhqdq zmm13, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm14, zmm18, zmm19
+ vpunpckhqdq zmm15, zmm18, zmm19
+ vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
+ vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
+ vshufps zmm16, zmm8, zmm10, 136
+ vshufps zmm17, zmm12, zmm14, 136
+ vmovdqa32 zmm20, zmm16
+ vpermt2d zmm16, zmm27, zmm17
+ vpermt2d zmm20, zmm31, zmm17
+ vshufps zmm17, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm21, zmm17
+ vpermt2d zmm17, zmm27, zmm30
+ vpermt2d zmm21, zmm31, zmm30
+ vshufps zmm18, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm22, zmm18
+ vpermt2d zmm18, zmm27, zmm8
+ vpermt2d zmm22, zmm31, zmm8
+ vshufps zmm19, zmm9, zmm11, 221
+ vshufps zmm8, zmm13, zmm15, 221
+ vmovdqa32 zmm23, zmm19
+ vpermt2d zmm19, zmm27, zmm8
+ vpermt2d zmm23, zmm31, zmm8
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm8, zmm24, zmm25
+ vpunpckhqdq zmm9, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm10, zmm24, zmm25
+ vpunpckhqdq zmm11, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm12, zmm24, zmm25
+ vpunpckhqdq zmm13, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm14, zmm24, zmm25
+ vpunpckhqdq zmm15, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vshufps zmm24, zmm8, zmm10, 136
+ vshufps zmm30, zmm12, zmm14, 136
+ vmovdqa32 zmm28, zmm24
+ vpermt2d zmm24, zmm27, zmm30
+ vpermt2d zmm28, zmm31, zmm30
+ vshufps zmm25, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm29, zmm25
+ vpermt2d zmm25, zmm27, zmm30
+ vpermt2d zmm29, zmm31, zmm30
+ vshufps zmm26, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm30, zmm26
+ vpermt2d zmm26, zmm27, zmm8
+ vpermt2d zmm30, zmm31, zmm8
+ vshufps zmm8, zmm9, zmm11, 221
+ vshufps zmm10, zmm13, zmm15, 221
+ vpermi2d zmm27, zmm8, zmm10
+ vpermi2d zmm31, zmm8, zmm10
+ vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa32 zmm12, zmmword ptr [rsp]
+ vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
+ vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm24
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm23
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm27
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm21
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm28
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm26
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm22
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm31
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpxord zmm0, zmm0, zmm8
+ vpxord zmm1, zmm1, zmm9
+ vpxord zmm2, zmm2, zmm10
+ vpxord zmm3, zmm3, zmm11
+ vpxord zmm4, zmm4, zmm12
+ vpxord zmm5, zmm5, zmm13
+ vpxord zmm6, zmm6, zmm14
+ vpxord zmm7, zmm7, zmm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vpunpckldq zmm16, zmm0, zmm1
+ vpunpckhdq zmm17, zmm0, zmm1
+ vpunpckldq zmm18, zmm2, zmm3
+ vpunpckhdq zmm19, zmm2, zmm3
+ vpunpckldq zmm20, zmm4, zmm5
+ vpunpckhdq zmm21, zmm4, zmm5
+ vpunpckldq zmm22, zmm6, zmm7
+ vpunpckhdq zmm23, zmm6, zmm7
+ vpunpcklqdq zmm0, zmm16, zmm18
+ vpunpckhqdq zmm1, zmm16, zmm18
+ vpunpcklqdq zmm2, zmm17, zmm19
+ vpunpckhqdq zmm3, zmm17, zmm19
+ vpunpcklqdq zmm4, zmm20, zmm22
+ vpunpckhqdq zmm5, zmm20, zmm22
+ vpunpcklqdq zmm6, zmm21, zmm23
+ vpunpckhqdq zmm7, zmm21, zmm23
+ vshufi32x4 zmm16, zmm0, zmm4, 0x88
+ vshufi32x4 zmm17, zmm1, zmm5, 0x88
+ vshufi32x4 zmm18, zmm2, zmm6, 0x88
+ vshufi32x4 zmm19, zmm3, zmm7, 0x88
+ vshufi32x4 zmm20, zmm0, zmm4, 0xDD
+ vshufi32x4 zmm21, zmm1, zmm5, 0xDD
+ vshufi32x4 zmm22, zmm2, zmm6, 0xDD
+ vshufi32x4 zmm23, zmm3, zmm7, 0xDD
+ vshufi32x4 zmm0, zmm16, zmm17, 0x88
+ vshufi32x4 zmm1, zmm18, zmm19, 0x88
+ vshufi32x4 zmm2, zmm20, zmm21, 0x88
+ vshufi32x4 zmm3, zmm22, zmm23, 0x88
+ vshufi32x4 zmm4, zmm16, zmm17, 0xDD
+ vshufi32x4 zmm5, zmm18, zmm19, 0xDD
+ vshufi32x4 zmm6, zmm20, zmm21, 0xDD
+ vshufi32x4 zmm7, zmm22, zmm23, 0xDD
+ vmovdqu32 zmmword ptr [rbx], zmm0
+ vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
+ vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
+ vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
+ vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
+ vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
+ vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
+ vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
+ vmovdqa32 zmm0, zmmword ptr [rsp]
+ vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
+ vmovdqa32 zmm2, zmm0
+ vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
+ vpcmpltud k2, zmm2, zmm0
+ vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
+ vmovdqa32 zmmword ptr [rsp], zmm2
+ vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+ add rdi, 128
+ add rbx, 512
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 16
+ cmp rsi, 16
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 6
+3:
+ test esi, 0x8
+ je 3f
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+2:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm16, ymm12, ymm14, 136
+ vshufps ymm17, ymm12, ymm14, 221
+ vshufps ymm18, ymm13, ymm15, 136
+ vshufps ymm19, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm20, ymm12, ymm14, 136
+ vshufps ymm21, ymm12, ymm14, 221
+ vshufps ymm22, ymm13, ymm15, 136
+ vshufps ymm23, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm24, ymm12, ymm14, 136
+ vshufps ymm25, ymm12, ymm14, 221
+ vshufps ymm26, ymm13, ymm15, 136
+ vshufps ymm27, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm28, ymm12, ymm14, 136
+ vshufps ymm29, ymm12, ymm14, 221
+ vshufps ymm30, ymm13, ymm15, 136
+ vshufps ymm31, ymm13, ymm15, 221
+ vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa ymm12, ymmword ptr [rsp]
+ vmovdqa ymm13, ymmword ptr [rsp+0x40]
+ vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd ymm15, dword ptr [rsp+0x88]
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm24
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm23
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm27
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm21
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm28
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm26
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm22
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm31
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 2b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp]
+ vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
+ vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
+ vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
+ vmovdqa ymmword ptr [rsp], ymm0
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ add rdi, 64
+ sub rsi, 8
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x80]
+ movzx r13, byte ptr [rbp+0x38]
+ movzx r12, byte ptr [rbp+0x48]
+ test esi, 0x4
+ je 3f
+ vbroadcasti32x4 zmm0, xmmword ptr [rcx]
+ vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
+ vmovdqa xmm12, xmmword ptr [rsp]
+ vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
+ vpunpckldq xmm14, xmm12, xmm13
+ vpunpckhdq xmm15, xmm12, xmm13
+ vpermq ymm14, ymm14, 0xDC
+ vpermq ymm15, ymm15, 0xDC
+ vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vinserti32x8 zmm13, zmm14, ymm15, 0x01
+ mov eax, 17476
+ kmovw k2, eax
+ vpblendmd zmm13 {k2}, zmm13, zmm12
+ vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov eax, 43690
+ kmovw k3, eax
+ mov eax, 34952
+ kmovw k4, eax
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vmovdqa32 zmm2, zmm15
+ vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
+ vpblendmd zmm3 {k4}, zmm13, zmm8
+ vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x30]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
+ vshufps zmm4, zmm8, zmm9, 136
+ vshufps zmm5, zmm8, zmm9, 221
+ vmovups zmm8, zmmword ptr [r8+rdx-0x20]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x10]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
+ vshufps zmm6, zmm8, zmm9, 136
+ vshufps zmm7, zmm8, zmm9, 221
+ vpshufd zmm6, zmm6, 0x93
+ vpshufd zmm7, zmm7, 0x93
+ mov al, 7
+9:
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x93
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x39
+ vpaddd zmm0, zmm0, zmm6
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm7
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x39
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x93
+ dec al
+ jz 9f
+ vshufps zmm8, zmm4, zmm5, 214
+ vpshufd zmm9, zmm4, 0x0F
+ vpshufd zmm4, zmm8, 0x39
+ vshufps zmm8, zmm6, zmm7, 250
+ vpblendmd zmm9 {k3}, zmm9, zmm8
+ vpunpcklqdq zmm8, zmm7, zmm5
+ vpblendmd zmm8 {k4}, zmm8, zmm6
+ vpshufd zmm8, zmm8, 0x78
+ vpunpckhdq zmm5, zmm5, zmm7
+ vpunpckldq zmm6, zmm6, zmm5
+ vpshufd zmm7, zmm6, 0x1E
+ vmovdqa32 zmm5, zmm9
+ vmovdqa32 zmm6, zmm8
+ jmp 9b
+9:
+ vpxord zmm0, zmm0, zmm2
+ vpxord zmm1, zmm1, zmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
+ vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x40]
+ vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
+ vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x40], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test esi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x4]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x88]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
+ vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
+ vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm14, dword ptr [rsp]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vpinsrd xmm3, xmm14, eax, 3
+ vmovdqa xmm2, xmm15
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+// BLAKE3 compression of one 64-byte block, writing the new chaining
+// value back over the input cv ("in place").  SysV AMD64 arguments:
+//   rdi = cv (8 x u32, read and updated in place)
+//   rsi = 64-byte message block
+//   dl  = block length, rcx = 64-bit block counter, r8b = flags
+// Requires AVX-512VL for the EVEX xmm forms (vprord/vpxord).
+.p2align 6
+zfs_blake3_compress_in_place_avx512:
+	_CET_ENDBR
+// State rows 0-1 = chaining value.
+	vmovdqu xmm0, xmmword ptr [rdi]
+	vmovdqu xmm1, xmmword ptr [rdi+0x10]
+// rdx = ((u64)flags << 32) | block_len; row 3 = [counter, block_len, flags].
+	movzx eax, r8b
+	movzx edx, dl
+	shl rax, 32
+	add rdx, rax
+	vmovq xmm3, rcx
+	vmovq xmm4, rdx
+	vpunpcklqdq xmm3, xmm3, xmm4
+// Row 2 = first four IV words.
+	vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+// Split the message into even/odd dword vectors (xmm4-xmm7).  The 0x93
+// pre-rotation of the second half matches the permutation step below
+// (standard vectorized BLAKE3 message layout).
+	vmovups xmm8, xmmword ptr [rsi]
+	vmovups xmm9, xmmword ptr [rsi+0x10]
+	vshufps xmm4, xmm8, xmm9, 136
+	vshufps xmm5, xmm8, xmm9, 221
+	vmovups xmm8, xmmword ptr [rsi+0x20]
+	vmovups xmm9, xmmword ptr [rsi+0x30]
+	vshufps xmm6, xmm8, xmm9, 136
+	vshufps xmm7, xmm8, xmm9, 221
+	vpshufd xmm6, xmm6, 0x93
+	vpshufd xmm7, xmm7, 0x93
+// Seven rounds.
+	mov al, 7
+9:
+// Column half-round: quarter-rounds on the four columns (rotates 16,12,8,7).
+	vpaddd xmm0, xmm0, xmm4
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 16
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 12
+	vpaddd xmm0, xmm0, xmm5
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 8
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 7
+// Diagonalize (rotate rows 0/3/2 by 1/2/3 lanes).
+	vpshufd xmm0, xmm0, 0x93
+	vpshufd xmm3, xmm3, 0x4E
+	vpshufd xmm2, xmm2, 0x39
+// Diagonal half-round.
+	vpaddd xmm0, xmm0, xmm6
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 16
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 12
+	vpaddd xmm0, xmm0, xmm7
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 8
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 7
+// Un-diagonalize.
+	vpshufd xmm0, xmm0, 0x39
+	vpshufd xmm3, xmm3, 0x4E
+	vpshufd xmm2, xmm2, 0x93
+	dec al
+	jz 9f
+// Message schedule: apply the BLAKE3 word permutation to xmm4-xmm7
+// for the next round.
+	vshufps xmm8, xmm4, xmm5, 214
+	vpshufd xmm9, xmm4, 0x0F
+	vpshufd xmm4, xmm8, 0x39
+	vshufps xmm8, xmm6, xmm7, 250
+	vpblendd xmm9, xmm9, xmm8, 0xAA
+	vpunpcklqdq xmm8, xmm7, xmm5
+	vpblendd xmm8, xmm8, xmm6, 0x88
+	vpshufd xmm8, xmm8, 0x78
+	vpunpckhdq xmm5, xmm5, xmm7
+	vpunpckldq xmm6, xmm6, xmm5
+	vpshufd xmm7, xmm6, 0x1E
+	vmovdqa xmm5, xmm9
+	vmovdqa xmm6, xmm8
+	jmp 9b
+9:
+// Feed-forward: new cv = (row0 ^ row2, row1 ^ row3), stored in place.
+	vpxor xmm0, xmm0, xmm2
+	vpxor xmm1, xmm1, xmm3
+	vmovdqu xmmword ptr [rdi], xmm0
+	vmovdqu xmmword ptr [rdi+0x10], xmm1
+	ret
+
+// BLAKE3 compression producing 64 bytes of extended (XOF) output.
+// SysV AMD64 arguments:
+//   rdi = cv (8 x u32, read only here)
+//   rsi = 64-byte message block
+//   dl  = block length, rcx = 64-bit block counter, r8b = flags
+//   r9  = 64-byte output buffer
+// Identical round structure to compress_in_place above; only the
+// epilogue differs (full 64-byte output instead of a 32-byte cv).
+.p2align 6
+zfs_blake3_compress_xof_avx512:
+	_CET_ENDBR
+// State rows 0-1 = chaining value.
+	vmovdqu xmm0, xmmword ptr [rdi]
+	vmovdqu xmm1, xmmword ptr [rdi+0x10]
+// rdx = ((u64)flags << 32) | block_len; row 3 = [counter, block_len, flags].
+	movzx eax, r8b
+	movzx edx, dl
+	shl rax, 32
+	add rdx, rax
+	vmovq xmm3, rcx
+	vmovq xmm4, rdx
+	vpunpcklqdq xmm3, xmm3, xmm4
+// Row 2 = first four IV words.
+	vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+// Split the message into even/odd dword vectors (xmm4-xmm7); second
+// half pre-rotated to match the permutation step below.
+	vmovups xmm8, xmmword ptr [rsi]
+	vmovups xmm9, xmmword ptr [rsi+0x10]
+	vshufps xmm4, xmm8, xmm9, 136
+	vshufps xmm5, xmm8, xmm9, 221
+	vmovups xmm8, xmmword ptr [rsi+0x20]
+	vmovups xmm9, xmmword ptr [rsi+0x30]
+	vshufps xmm6, xmm8, xmm9, 136
+	vshufps xmm7, xmm8, xmm9, 221
+	vpshufd xmm6, xmm6, 0x93
+	vpshufd xmm7, xmm7, 0x93
+// Seven rounds.
+	mov al, 7
+9:
+// Column half-round (rotates 16,12,8,7).
+	vpaddd xmm0, xmm0, xmm4
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 16
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 12
+	vpaddd xmm0, xmm0, xmm5
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 8
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 7
+// Diagonalize.
+	vpshufd xmm0, xmm0, 0x93
+	vpshufd xmm3, xmm3, 0x4E
+	vpshufd xmm2, xmm2, 0x39
+// Diagonal half-round.
+	vpaddd xmm0, xmm0, xmm6
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 16
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 12
+	vpaddd xmm0, xmm0, xmm7
+	vpaddd xmm0, xmm0, xmm1
+	vpxord xmm3, xmm3, xmm0
+	vprord xmm3, xmm3, 8
+	vpaddd xmm2, xmm2, xmm3
+	vpxord xmm1, xmm1, xmm2
+	vprord xmm1, xmm1, 7
+// Un-diagonalize.
+	vpshufd xmm0, xmm0, 0x39
+	vpshufd xmm3, xmm3, 0x4E
+	vpshufd xmm2, xmm2, 0x93
+	dec al
+	jz 9f
+// Message schedule: BLAKE3 word permutation for the next round.
+	vshufps xmm8, xmm4, xmm5, 214
+	vpshufd xmm9, xmm4, 0x0F
+	vpshufd xmm4, xmm8, 0x39
+	vshufps xmm8, xmm6, xmm7, 250
+	vpblendd xmm9, xmm9, xmm8, 0xAA
+	vpunpcklqdq xmm8, xmm7, xmm5
+	vpblendd xmm8, xmm8, xmm6, 0x88
+	vpshufd xmm8, xmm8, 0x78
+	vpunpckhdq xmm5, xmm5, xmm7
+	vpunpckldq xmm6, xmm6, xmm5
+	vpshufd xmm7, xmm6, 0x1E
+	vmovdqa xmm5, xmm9
+	vmovdqa xmm6, xmm8
+	jmp 9b
+9:
+// XOF epilogue: rows 0/1 get the usual feed-forward; rows 2/3 are
+// additionally xored with the original cv, yielding 64 output bytes.
+	vpxor xmm0, xmm0, xmm2
+	vpxor xmm1, xmm1, xmm3
+	vpxor xmm2, xmm2, [rdi]
+	vpxor xmm3, xmm3, [rdi+0x10]
+	vmovdqu xmmword ptr [r9], xmm0
+	vmovdqu xmmword ptr [r9+0x10], xmm1
+	vmovdqu xmmword ptr [r9+0x20], xmm2
+	vmovdqu xmmword ptr [r9+0x30], xmm3
+	ret
+
+.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512
+.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512
+.size zfs_blake3_compress_xof_avx512, . - zfs_blake3_compress_xof_avx512
+
+// Read-only constant pool for the AVX-512 kernels above.
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+
+.p2align 6
+// Dword lane-selection indices.  NOTE(review): presumably consumed by
+// two-source permutes (vpermi2d/vpermt2d) in the 16-way transpose of
+// zfs_blake3_hash_many_avx512, whose body starts before this hunk --
+// confirm against the full file.
+INDEX0:
+	.long 0, 1, 2, 3, 16, 17, 18, 19
+	.long 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1:
+	.long 4, 5, 6, 7, 20, 21, 22, 23
+	.long 12, 13, 14, 15, 28, 29, 30, 31
+// Per-lane counter offsets 0..15 for the wide hash_many kernel.
+ADD0:
+	.long 0, 1, 2, 3, 4, 5, 6, 7
+	.long 8, 9, 10, 11, 12, 13, 14, 15
+// Single dwords -- NOTE(review): presumably broadcast (vpbroadcastd) as
+// counter increments of 1 and 16; their use sites are outside this hunk.
+ADD1: .long 1
+
+ADD16: .long 16
+// BLAKE3 block length (64 bytes); inserted into state row 3 by the
+// kernels above.
+BLAKE3_BLOCK_LEN:
+	.long 64
+.p2align 6
+// First four words of the BLAKE3 IV (same values as SHA-256's initial
+// hash words); BLAKE3_IV_n labels allow addressing individual words.
+BLAKE3_IV:
+BLAKE3_IV_0:
+	.long 0x6A09E667
+BLAKE3_IV_1:
+	.long 0xBB67AE85
+BLAKE3_IV_2:
+	.long 0x3C6EF372
+BLAKE3_IV_3:
+	.long 0xA54FF53A
+
+#endif /* HAVE_AVX512 */
+
+#ifdef __ELF__
+// Mark the stack non-executable on ELF targets.
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_sse2.S b/module/icp/asm-x86_64/blake3/blake3_sse2.S
new file mode 100644
index 000000000..39d23ee23
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_sse2.S
@@ -0,0 +1,2323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#if defined(HAVE_SSE2)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_hash_many_sse2
+.global zfs_blake3_compress_in_place_sse2
+.global zfs_blake3_compress_xof_sse2
+
+.text
+.type zfs_blake3_hash_many_sse2,@function
+.type zfs_blake3_compress_in_place_sse2,@function
+.type zfs_blake3_compress_xof_sse2,@function
+
+ .p2align 6
+zfs_blake3_hash_many_sse2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movq xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm13, xmm12
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ movdqa xmm13, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm12, xmm13
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm6, xmm5
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x30]
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ mov eax, dword ptr [rsp+0x130]
+ neg eax
+ mov r10d, dword ptr [rsp+0x110+8*rax]
+ mov r11d, dword ptr [rsp+0x120+8*rax]
+ mov dword ptr [rsp+0x110], r10d
+ mov dword ptr [rsp+0x120], r11d
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ shl rax, 32
+ or rax, 64
+ movq xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+/*
+ * zfs_blake3_compress_in_place_sse2 - one BLAKE3 compression of a single
+ * 64-byte block; the resulting 32-byte chaining value overwrites the input.
+ *
+ * SysV AMD64 arguments (NOTE(review): register meanings inferred from how
+ * they are used below and from the BLAKE3 C prototype - confirm against
+ * the declaration in blake3_impl.h):
+ *   rdi = uint32_t cv[8]        (read, and written with the new CV)
+ *   rsi = const uint8_t block[64]
+ *   rdx = block_len, rcx = 64-bit chunk counter, r8 = domain flags
+ */
+.p2align 6
+zfs_blake3_compress_in_place_sse2:
+	_CET_ENDBR
+	/* State rows 0 and 1: the caller's 32-byte chaining value. */
+	movups xmm0, xmmword ptr [rdi]
+	movups xmm1, xmmword ptr [rdi+0x10]
+	/* State row 2: first four IV words (constant table below). */
+	movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+	/* State row 3: [counter_lo, counter_hi, block_len, flags].
+	 * rdx becomes block_len | (flags << 32) before being packed. */
+	shl r8, 32
+	add rdx, r8
+	movq xmm3, rcx
+	movq xmm4, rdx
+	punpcklqdq xmm3, xmm4
+	/* Load the 64-byte message and de-interleave it with shufps so the
+	 * 16 words land in the lane order the round below expects
+	 * (even/odd word split, xmm6/xmm7 additionally rotated by 0x93). */
+	movups xmm4, xmmword ptr [rsi]
+	movups xmm5, xmmword ptr [rsi+0x10]
+	movaps xmm8, xmm4
+	shufps xmm4, xmm5, 136
+	shufps xmm8, xmm5, 221
+	movaps xmm5, xmm8
+	movups xmm6, xmmword ptr [rsi+0x20]
+	movups xmm7, xmmword ptr [rsi+0x30]
+	movaps xmm8, xmm6
+	shufps xmm6, xmm7, 136
+	pshufd xmm6, xmm6, 0x93
+	shufps xmm8, xmm7, 221
+	pshufd xmm7, xmm8, 0x93
+	/* al counts the 7 BLAKE3 rounds; one trip through label 9 = 1 round. */
+	mov al, 7
+9:
+	/* Column step of the G function. SSE2 has no rotate, so each
+	 * rotr is synthesized: rotr16 = swap 16-bit halves (pshuflw/pshufhw
+	 * with 0xB1), rotr12 = pslld 20 | psrld 12, rotr8 = psrld 8 ^
+	 * pslld 24, rotr7 = pslld 25 | psrld 7. */
+	paddd xmm0, xmm4
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	pshuflw xmm3, xmm3, 0xB1
+	pshufhw xmm3, xmm3, 0xB1
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 20
+	psrld xmm11, 12
+	por xmm1, xmm11
+	paddd xmm0, xmm5
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm14, xmm3
+	psrld xmm3, 8
+	pslld xmm14, 24
+	pxor xmm3, xmm14
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 25
+	psrld xmm11, 7
+	por xmm1, xmm11
+	/* Diagonalize the state rows, run the diagonal step, then
+	 * un-diagonalize (the 0x39/0x93 shuffles at the end). */
+	pshufd xmm0, xmm0, 0x93
+	pshufd xmm3, xmm3, 0x4E
+	pshufd xmm2, xmm2, 0x39
+	paddd xmm0, xmm6
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	pshuflw xmm3, xmm3, 0xB1
+	pshufhw xmm3, xmm3, 0xB1
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 20
+	psrld xmm11, 12
+	por xmm1, xmm11
+	paddd xmm0, xmm7
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm14, xmm3
+	psrld xmm3, 8
+	pslld xmm14, 24
+	pxor xmm3, xmm14
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 25
+	psrld xmm11, 7
+	por xmm1, xmm11
+	pshufd xmm0, xmm0, 0x39
+	pshufd xmm3, xmm3, 0x4E
+	pshufd xmm2, xmm2, 0x93
+	dec al
+	jz 9f
+	/* Apply the BLAKE3 message-word permutation for the next round.
+	 * SSE2 lacks pblendw, so blends are emulated with pand/por against
+	 * the PBLENDW_* dword masks in .rodata. */
+	movdqa xmm8, xmm4
+	shufps xmm8, xmm5, 214
+	pshufd xmm9, xmm4, 0x0F
+	pshufd xmm4, xmm8, 0x39
+	movdqa xmm8, xmm6
+	shufps xmm8, xmm7, 250
+	pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+	pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+	por xmm9, xmm8
+	movdqa xmm8, xmm7
+	punpcklqdq xmm8, xmm5
+	movdqa xmm10, xmm6
+	pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+	pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+	por xmm8, xmm10
+	pshufd xmm8, xmm8, 0x78
+	punpckhdq xmm5, xmm7
+	punpckldq xmm6, xmm5
+	pshufd xmm7, xmm6, 0x1E
+	movdqa xmm5, xmm9
+	movdqa xmm6, xmm8
+	jmp 9b
+9:
+	/* Finalize: new CV = (row0 ^ row2, row1 ^ row3), stored in place. */
+	pxor xmm0, xmm2
+	pxor xmm1, xmm3
+	movups xmmword ptr [rdi], xmm0
+	movups xmmword ptr [rdi+0x10], xmm1
+	ret
+
<br>
+/*
+ * zfs_blake3_compress_xof_sse2 - one BLAKE3 compression producing the full
+ * 64-byte extended output (XOF), written to a separate output buffer; the
+ * input chaining value is left unmodified.
+ *
+ * SysV AMD64 arguments (NOTE(review): register meanings inferred from usage
+ * below and the BLAKE3 C prototype - confirm against blake3_impl.h):
+ *   rdi = const uint32_t cv[8]
+ *   rsi = const uint8_t block[64]
+ *   rdx = block_len (low 8 bits used), rcx = 64-bit counter,
+ *   r8  = flags (low 8 bits used), r9 = uint8_t out[64]
+ */
+.p2align 6
+zfs_blake3_compress_xof_sse2:
+	_CET_ENDBR
+	/* Rows 0/1 = chaining value, row 2 = first four IV words. */
+	movups xmm0, xmmword ptr [rdi]
+	movups xmm1, xmmword ptr [rdi+0x10]
+	movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+	/* Unlike the in-place variant, flags and block_len are explicitly
+	 * truncated to their 8-bit values before being packed into row 3
+	 * as [counter_lo, counter_hi, block_len, flags]. */
+	movzx eax, r8b
+	movzx edx, dl
+	shl rax, 32
+	add rdx, rax
+	movq xmm3, rcx
+	movq xmm4, rdx
+	punpcklqdq xmm3, xmm4
+	/* De-interleave the 64-byte message into round order (even/odd
+	 * split via shufps; xmm6/xmm7 pre-rotated with pshufd 0x93). */
+	movups xmm4, xmmword ptr [rsi]
+	movups xmm5, xmmword ptr [rsi+0x10]
+	movaps xmm8, xmm4
+	shufps xmm4, xmm5, 136
+	shufps xmm8, xmm5, 221
+	movaps xmm5, xmm8
+	movups xmm6, xmmword ptr [rsi+0x20]
+	movups xmm7, xmmword ptr [rsi+0x30]
+	movaps xmm8, xmm6
+	shufps xmm6, xmm7, 136
+	pshufd xmm6, xmm6, 0x93
+	shufps xmm8, xmm7, 221
+	pshufd xmm7, xmm8, 0x93
+	/* 7 rounds; each pass through label 9 is one round. */
+	mov al, 7
+9:
+	/* Column step. Rotations are synthesized from shifts/shuffles:
+	 * rotr16 = pshuflw/pshufhw 0xB1, rotr12 = pslld 20 | psrld 12,
+	 * rotr8 = psrld 8 ^ pslld 24, rotr7 = pslld 25 | psrld 7. */
+	paddd xmm0, xmm4
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	pshuflw xmm3, xmm3, 0xB1
+	pshufhw xmm3, xmm3, 0xB1
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 20
+	psrld xmm11, 12
+	por xmm1, xmm11
+	paddd xmm0, xmm5
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm14, xmm3
+	psrld xmm3, 8
+	pslld xmm14, 24
+	pxor xmm3, xmm14
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 25
+	psrld xmm11, 7
+	por xmm1, xmm11
+	/* Diagonal step between the diagonalize (0x93/0x4E/0x39) and
+	 * un-diagonalize (0x39/0x4E/0x93) row shuffles. */
+	pshufd xmm0, xmm0, 0x93
+	pshufd xmm3, xmm3, 0x4E
+	pshufd xmm2, xmm2, 0x39
+	paddd xmm0, xmm6
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	pshuflw xmm3, xmm3, 0xB1
+	pshufhw xmm3, xmm3, 0xB1
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 20
+	psrld xmm11, 12
+	por xmm1, xmm11
+	paddd xmm0, xmm7
+	paddd xmm0, xmm1
+	pxor xmm3, xmm0
+	movdqa xmm14, xmm3
+	psrld xmm3, 8
+	pslld xmm14, 24
+	pxor xmm3, xmm14
+	paddd xmm2, xmm3
+	pxor xmm1, xmm2
+	movdqa xmm11, xmm1
+	pslld xmm1, 25
+	psrld xmm11, 7
+	por xmm1, xmm11
+	pshufd xmm0, xmm0, 0x39
+	pshufd xmm3, xmm3, 0x4E
+	pshufd xmm2, xmm2, 0x93
+	dec al
+	jz 9f
+	/* Message permutation for the next round; pblendw is emulated
+	 * with pand/por against the PBLENDW_* masks (SSE2 has no blend). */
+	movdqa xmm8, xmm4
+	shufps xmm8, xmm5, 214
+	pshufd xmm9, xmm4, 0x0F
+	pshufd xmm4, xmm8, 0x39
+	movdqa xmm8, xmm6
+	shufps xmm8, xmm7, 250
+	pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+	pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+	por xmm9, xmm8
+	movdqa xmm8, xmm7
+	punpcklqdq xmm8, xmm5
+	movdqa xmm10, xmm6
+	pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+	pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+	por xmm8, xmm10
+	pshufd xmm8, xmm8, 0x78
+	punpckhdq xmm5, xmm7
+	punpckldq xmm6, xmm5
+	pshufd xmm7, xmm6, 0x1E
+	movdqa xmm5, xmm9
+	movdqa xmm6, xmm8
+	jmp 9b
+9:
+	/* XOF finalization: out[0..31]  = (row0^row2, row1^row3),
+	 * out[32..63] = (row2^cv[0..3], row3^cv[4..7]); cv is untouched. */
+	movdqu xmm4, xmmword ptr [rdi]
+	movdqu xmm5, xmmword ptr [rdi+0x10]
+	pxor xmm0, xmm2
+	pxor xmm1, xmm3
+	pxor xmm2, xmm4
+	pxor xmm3, xmm5
+	movups xmmword ptr [r9], xmm0
+	movups xmmword ptr [r9+0x10], xmm1
+	movups xmmword ptr [r9+0x20], xmm2
+	movups xmmword ptr [r9+0x30], xmm3
+	ret
+
+.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2
+.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2
+.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6
+/* First four words of the BLAKE3 IV; loaded as state row 2 by the
+ * compression routines above. */
+BLAKE3_IV:
+	.long 0x6A09E667, 0xBB67AE85
+	.long 0x3C6EF372, 0xA54FF53A
+/* NOTE(review): presumably the per-lane counter offsets (0..3) and the
+ * +4 stride used by the 4-way hash_many counter setup - confirm against
+ * the hash_many prologue, which is outside this hunk. */
+ADD0:
+	.long 0, 1, 2, 3
+ADD1:
+	.long 4, 4, 4, 4
+/* Each of the first four IV words broadcast across all four lanes,
+ * presumably for the 4-way transposed state in hash_many - confirm. */
+BLAKE3_IV_0:
+	.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+	.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+	.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+	.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+/* Block length (64) broadcast across four lanes. */
+BLAKE3_BLOCK_LEN:
+	.long 64, 64, 64, 64
+/* Sign-bit flip mask: pcmpgtd is signed-only, so hash_many XORs both
+ * operands with this before comparing to detect unsigned carry when
+ * incrementing the four chunk counters (see its use in the main loop). */
+CMP_MSB_MASK:
+	.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+/* Dword blend masks: SSE2 lacks pblendw, so the message permutation
+ * emulates it with pand/por against these; the name encodes the
+ * pblendw immediate each mask reproduces. */
+PBLENDW_0x33_MASK:
+	.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK:
+	.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK:
+	.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK:
+	.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
+
+#endif /* HAVE_SSE2 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_sse41.S b/module/icp/asm-x86_64/blake3/blake3_sse41.S
new file mode 100644
index 000000000..1c40236f0
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_sse41.S
@@ -0,0 +1,2058 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#if defined(HAVE_SSE4_1)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_compress_in_place_sse41
+.global zfs_blake3_compress_xof_sse41
+.global zfs_blake3_hash_many_sse41
+
+.text
+.type zfs_blake3_hash_many_sse41,@function
+.type zfs_blake3_compress_in_place_sse41,@function
+.type zfs_blake3_compress_xof_sse41,@function
+
+.p2align 6
+zfs_blake3_hash_many_sse41:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ pinsrd xmm3, eax, 3
+ pinsrd xmm11, eax, 3
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm12, xmmword ptr [ROT16+rip]
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm13, xmmword ptr [ROT8+rip]
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pblendw xmm13, xmm12, 0xCC
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ pblendw xmm12, xmm6, 0xC0
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pblendw xmm6, xmm5, 0xCC
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ pblendw xmm5, xmm14, 0xC0
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ movdqa xmm0, xmmword ptr [rsp+0x130]
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm2, xmmword ptr [rsp+0x120]
+ movdqu xmm3, xmmword ptr [rsp+0x118]
+ movdqu xmm4, xmmword ptr [rsp+0x128]
+ blendvps xmm1, xmm3, xmm0
+ blendvps xmm2, xmm4, xmm0
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm3, xmm13
+ pinsrd xmm3, eax, 3
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+.p2align 6
+/*
+ * zfs_blake3_compress_in_place_sse41 -- run one BLAKE3 compression and
+ * store the first half of the final state back over the 32-byte
+ * chaining value.
+ * Register use (SysV AMD64, inferred from the loads below -- TODO
+ * confirm against the C prototype): rdi = cv (in/out), rsi = 64-byte
+ * block, rdx = block_len, rcx = 64-bit counter, r8 = flags.
+ */
+zfs_blake3_compress_in_place_sse41:
+ _CET_ENDBR
+ /* state rows 0-1 = chaining value, row 2 = IV */
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /* state row 3 = counter_lo | counter_hi | block_len | flags */
+ shl r8, 32
+ add rdx, r8
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+ /* de-interleave the sixteen 32-bit message words into xmm4-xmm7 */
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ /* pshufb masks implementing the 16- and 8-bit rotates */
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov al, 7
+/* round loop; al counts down the 7 BLAKE3 rounds */
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* rotate rows to set up the diagonal half-round */
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* rotate rows back after the diagonal half-round */
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ /* not the last round: apply the message-word permutation */
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+/* finalize: new cv = (rows 0,1) ^ (rows 2,3), stored in place */
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ movups xmmword ptr [rdi], xmm0
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+.p2align 6
+/*
+ * zfs_blake3_compress_xof_sse41 -- one compression in XOF mode,
+ * producing the full 64-byte extended output instead of updating the
+ * chaining value in place.
+ * Register use (SysV AMD64, inferred from the loads below -- TODO
+ * confirm against the C prototype): rdi = cv (read only), rsi = 64-byte
+ * block, rcx = 64-bit counter, dl = block_len, r8b = flags,
+ * r9 = 64-byte output buffer.
+ */
+zfs_blake3_compress_xof_sse41:
+ _CET_ENDBR
+ /* state rows 0-1 = chaining value, row 2 = IV */
+ movups xmm0, xmmword ptr [rdi]
+ movups xmm1, xmmword ptr [rdi+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /* row 3 = counter | block_len | flags (only low bytes of dl/r8b used) */
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ movq xmm3, rcx
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4
+ /* de-interleave the sixteen 32-bit message words into xmm4-xmm7 */
+ movups xmm4, xmmword ptr [rsi]
+ movups xmm5, xmmword ptr [rsi+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rsi+0x20]
+ movups xmm7, xmmword ptr [rsi+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ /* pshufb masks implementing the 16- and 8-bit rotates */
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov al, 7
+/* round loop; al counts down the 7 BLAKE3 rounds */
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* rotate rows to set up the diagonal half-round */
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* rotate rows back after the diagonal half-round */
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ /* not the last round: apply the message-word permutation */
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+/*
+ * XOF finalization: out[0..31] = (rows 0,1) ^ (rows 2,3), and
+ * out[32..63] = (rows 2,3) ^ the input chaining value.
+ */
+9:
+ movdqu xmm4, xmmword ptr [rdi]
+ movdqu xmm5, xmmword ptr [rdi+0x10]
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movups xmmword ptr [r9], xmm0
+ movups xmmword ptr [r9+0x10], xmm1
+ movups xmmword ptr [r9+0x20], xmm2
+ movups xmmword ptr [r9+0x30], xmm3
+ ret
+
+.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41
+.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41
+.size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6
+/* first four words of the BLAKE3 IV (same constants as the SHA-256 IV) */
+BLAKE3_IV:
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+/* pshufb masks: rotate each 32-bit lane right by 16 resp. 8 bits */
+ROT16:
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+/* per-lane counter offsets and the 4-way counter increment */
+ADD0:
+ .long 0, 1, 2, 3
+ADD1:
+ .long 4, 4, 4, 4
+/* IV words broadcast across all four SIMD lanes */
+BLAKE3_IV_0:
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:
+ .long 64, 64, 64, 64
+/* sign-bit flip mask: unsigned 32-bit compare via signed pcmpgtd */
+CMP_MSB_MASK:
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+
+#endif /* HAVE_SSE4_1 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c
index f09389e6d..4df09884a 100644
--- a/module/zcommon/zfeature_common.c
+++ b/module/zcommon/zfeature_common.c
@@ -696,16 +696,15 @@ zpool_feature_init(void)
ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures);
{
-
- static const spa_feature_t zilsaxattr_deps[] = {
- SPA_FEATURE_EXTENSIBLE_DATASET,
- SPA_FEATURE_NONE
- };
- zfeature_register(SPA_FEATURE_ZILSAXATTR,
- "org.openzfs:zilsaxattr", "zilsaxattr",
- "Support for xattr=sa extended attribute logging in ZIL.",
- ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT,
- ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures);
+ static const spa_feature_t zilsaxattr_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_ZILSAXATTR,
+ "org.openzfs:zilsaxattr", "zilsaxattr",
+ "Support for xattr=sa extended attribute logging in ZIL.",
+ ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT,
+ ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures);
}
zfeature_register(SPA_FEATURE_HEAD_ERRLOG,
@@ -714,6 +713,18 @@ zpool_feature_init(void)
ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL,
sfeatures);
+ {
+ static const spa_feature_t blake3_deps[] = {
+ SPA_FEATURE_EXTENSIBLE_DATASET,
+ SPA_FEATURE_NONE
+ };
+ zfeature_register(SPA_FEATURE_BLAKE3,
+ "org.openzfs:blake3", "blake3",
+ "BLAKE3 hash algorithm.",
+ ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN,
+ blake3_deps, sfeatures);
+ }
+
zfs_mod_list_supported_free(sfeatures);
}
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c
index 500d80a33..32475611e 100644
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@@ -84,6 +84,7 @@ zfs_prop_init(void)
{ "sha512", ZIO_CHECKSUM_SHA512 },
{ "skein", ZIO_CHECKSUM_SKEIN },
{ "edonr", ZIO_CHECKSUM_EDONR },
+ { "blake3", ZIO_CHECKSUM_BLAKE3 },
{ NULL }
};
@@ -102,6 +103,9 @@ zfs_prop_init(void)
ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
{ "edonr,verify",
ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
+ { "blake3", ZIO_CHECKSUM_BLAKE3 },
+ { "blake3,verify",
+ ZIO_CHECKSUM_BLAKE3 | ZIO_CHECKSUM_VERIFY },
{ NULL }
};
@@ -394,12 +398,12 @@ zfs_prop_init(void)
ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
ZFS_TYPE_VOLUME,
"on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein"
- " | edonr",
+ " | edonr | blake3",
"CHECKSUM", checksum_table, sfeatures);
zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"on | off | verify | sha256[,verify] | sha512[,verify] | "
- "skein[,verify] | edonr,verify",
+ "skein[,verify] | edonr,verify | blake3[,verify]",
"DEDUP", dedup_table, sfeatures);
zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
diff --git a/module/zfs/blake3_zfs.c b/module/zfs/blake3_zfs.c
new file mode 100644
index 000000000..51c455fe7
--- /dev/null
+++ b/module/zfs/blake3_zfs.c
@@ -0,0 +1,113 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+#include <sys/blake3.h>
+#include <sys/abd.h>
+
+/*
+ * Callback for abd_iterate_func(): feed one linear buffer segment into
+ * the BLAKE3 state passed via arg.  Always returns 0 so the iteration
+ * continues over the whole ABD.
+ */
+static int
+blake3_incremental(void *buf, size_t size, void *arg)
+{
+ BLAKE3_CTX *ctx = arg;
+
+ Blake3_Update(ctx, buf, size);
+
+ return (0);
+}
+
+/*
+ * Computes a native 256-bit BLAKE3 MAC checksum. Please note that this
+ * function requires the presence of a ctx_template that should be allocated
+ * using abd_checksum_blake3_tmpl_init.
+ */
+void
+abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template,
+ zio_cksum_t *zcp)
+{
+ BLAKE3_CTX *ctx;
+
+ /*
+  * NOTE(review): KM_NOSLEEP may return NULL under memory pressure and
+  * ASSERT() compiles out of production builds, so the memcpy() below
+  * could dereference NULL -- consider KM_SLEEP or an explicit check.
+  */
+ ctx = kmem_alloc(sizeof (*ctx), KM_NOSLEEP);
+ ASSERT(ctx != 0);
+ ASSERT(ctx_template != 0);
+
+ /* clone the keyed template, stream the ABD through it, finalize */
+ memcpy(ctx, ctx_template, sizeof (*ctx));
+ (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx);
+ Blake3_Final(ctx, (uint8_t *)zcp);
+
+ /* scrub the keyed state before returning the buffer */
+ memset(ctx, 0, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
+
+/*
+ * Byteswapped version of abd_checksum_blake3_native. This just invokes
+ * the native checksum function and byteswaps the resulting checksum (since
+ * BLAKE3 is internally endian-insensitive).
+ */
+void
+abd_checksum_blake3_byteswap(abd_t *abd, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ ASSERT(ctx_template != 0);
+
+ /* compute natively, then byteswap each 64-bit word of the digest */
+ abd_checksum_blake3_native(abd, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates a BLAKE3 MAC template suitable for using in BLAKE3 MAC checksum
+ * computations and returns a pointer to it.
+ */
+void *
+abd_checksum_blake3_tmpl_init(const zio_cksum_salt_t *salt)
+{
+ BLAKE3_CTX *ctx;
+
+ /* the pool salt is used verbatim as the 256-bit BLAKE3 key */
+ ASSERT(sizeof (salt->zcs_bytes) == 32);
+
+ /* init reference object; KM_SLEEP allocations do not fail */
+ ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP);
+ Blake3_InitKeyed(ctx, salt->zcs_bytes);
+
+ return (ctx);
+}
+
+/*
+ * Frees a BLAKE3 context template previously allocated using
+ * zio_checksum_blake3_tmpl_init.
+ */
+void
+abd_checksum_blake3_tmpl_free(void *ctx_template)
+{
+ BLAKE3_CTX *ctx = ctx_template;
+
+ /* scrub the keyed state before freeing */
+ memset(ctx, 0, sizeof (*ctx));
+ kmem_free(ctx, sizeof (*ctx));
+}
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 12aec4a56..c57c69bd7 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -30,6 +30,7 @@
*/
#include <sys/zfs_context.h>
+#include <sys/zfs_chksum.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
@@ -2417,6 +2418,7 @@ spa_init(spa_mode_t mode)
vdev_raidz_math_init();
vdev_file_init();
zfs_prop_init();
+ chksum_init();
zpool_prop_init();
zpool_feature_init();
spa_config_load();
@@ -2438,6 +2440,7 @@ spa_fini(void)
vdev_cache_stat_fini();
vdev_mirror_stat_fini();
vdev_raidz_math_fini();
+ chksum_fini();
zil_fini();
dmu_fini();
zio_fini();
diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c
new file mode 100644
index 000000000..3ebe08541
--- /dev/null
+++ b/module/zfs/zfs_chksum.c
@@ -0,0 +1,316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/types.h>
+#include <sys/spa.h>
+#include <sys/zio_checksum.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_chksum.h>
+
+#include <sys/blake3.h>
+
+static kstat_t *chksum_kstat = NULL;
+
+typedef struct {
+ const char *name;
+ const char *impl;
+ uint64_t bs1k;
+ uint64_t bs4k;
+ uint64_t bs16k;
+ uint64_t bs64k;
+ uint64_t bs256k;
+ uint64_t bs1m;
+ uint64_t bs4m;
+ zio_cksum_salt_t salt;
+ zio_checksum_t *(func);
+ zio_checksum_tmpl_init_t *(init);
+ zio_checksum_tmpl_free_t *(free);
+} chksum_stat_t;
+
+static int chksum_stat_cnt = 0;
+static chksum_stat_t *chksum_stat_data = 0;
+
+/*
+ * i3-1005G1 test output:
+ *
+ * implementation 1k 4k 16k 64k 256k 1m 4m
+ * fletcher-4 5421 15001 26468 32555 34720 32801 18847
+ * edonr-generic 1196 1602 1761 1749 1762 1759 1751
+ * skein-generic 546 591 608 615 619 612 616
+ * sha256-generic 246 270 274 274 277 275 276
+ * sha256-avx 262 296 304 307 307 307 306
+ * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228
+ * sha256-openssl 240 300 316 314 304 285 276
+ * sha512-generic 333 374 385 392 391 393 392
+ * sha512-openssl 353 441 467 476 472 467 426
+ * sha512-avx 362 444 473 475 479 476 478
+ * sha512-avx2 394 500 530 538 543 545 542
+ * blake3-generic 308 313 313 313 312 313 312
+ * blake3-sse2 402 1289 1423 1446 1432 1458 1413
+ * blake3-sse41 427 1470 1625 1704 1679 1607 1629
+ * blake3-avx2 428 1920 3095 3343 3356 3318 3204
+ * blake3-avx512 473 2687 4905 5836 5844 5643 5374
+ */
+/*
+ * Emit the column header line of the chksum_bench kstat.
+ * NOTE(review): the first snprintf() passes "size" rather than
+ * "size - off"; harmless since off is 0 there, but inconsistent with
+ * the remaining calls.
+ */
+static int
+chksum_stat_kstat_headers(char *buf, size_t size)
+{
+ ssize_t off = 0;
+
+ off += snprintf(buf + off, size, "%-23s", "implementation");
+ off += snprintf(buf + off, size - off, "%8s", "1k");
+ off += snprintf(buf + off, size - off, "%8s", "4k");
+ off += snprintf(buf + off, size - off, "%8s", "16k");
+ off += snprintf(buf + off, size - off, "%8s", "64k");
+ off += snprintf(buf + off, size - off, "%8s", "256k");
+ off += snprintf(buf + off, size - off, "%8s", "1m");
+ (void) snprintf(buf + off, size - off, "%8s\n", "4m");
+
+ return (0);
+}
+
+/*
+ * Emit one benchmark result row: "name-impl" plus the seven MiB/s
+ * throughput columns.
+ * NOTE(review): snprintf() returns the would-be length, so on
+ * truncation "off" can exceed "size" and "size - off" (size_t) would
+ * wrap -- confirm callers always pass a large enough buffer.
+ */
+static int
+chksum_stat_kstat_data(char *buf, size_t size, void *data)
+{
+ chksum_stat_t *cs;
+ ssize_t off = 0;
+ char b[24];
+
+ cs = (chksum_stat_t *)data;
+ snprintf(b, 23, "%s-%s", cs->name, cs->impl);
+ off += snprintf(buf + off, size - off, "%-23s", b);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs1k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs4k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs16k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs64k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs256k);
+ off += snprintf(buf + off, size - off, "%8llu",
+ (u_longlong_t)cs->bs1m);
+ (void) snprintf(buf + off, size - off, "%8llu\n",
+ (u_longlong_t)cs->bs4m);
+
+ return (0);
+}
+
+/*
+ * Row-selection callback for the raw kstat: return the n'th stat row,
+ * or NULL past the end, which terminates the kstat iteration.
+ */
+static void *
+chksum_stat_kstat_addr(kstat_t *ksp, loff_t n)
+{
+ if (n < chksum_stat_cnt)
+ ksp->ks_private = (void *)(chksum_stat_data + n);
+ else
+ ksp->ks_private = NULL;
+
+ return (ksp->ks_private);
+}
+
+/*
+ * Time cs->func over a fixed buffer size for at least 1 ms with
+ * preemption disabled, and report the throughput in MiB/s via *result.
+ * "round" selects the buffer size: 1=1k 2=4k 3=16k 4=64k 5=256k
+ * 6=1m 7=4m (the original case comments for rounds 2-4 were wrong).
+ */
+static void
+chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round,
+ uint64_t *result)
+{
+ hrtime_t start;
+ uint64_t run_bw, run_time_ns, run_count = 0, size = 0;
+ uint32_t l, loops = 0;
+ zio_cksum_t zcp;
+
+ /* map benchmark round to buffer size and inner-loop count */
+ switch (round) {
+ case 1: /* 1k */
+ size = 1<<10; loops = 128; break;
+ case 2: /* 4k */
+ size = 1<<12; loops = 64; break;
+ case 3: /* 16k */
+ size = 1<<14; loops = 32; break;
+ case 4: /* 64k */
+ size = 1<<16; loops = 16; break;
+ case 5: /* 256k */
+ size = 1<<18; loops = 8; break;
+ case 6: /* 1m */
+ size = 1<<20; loops = 4; break;
+ case 7: /* 4m */
+ size = 1<<22; loops = 1; break;
+ }
+
+ /* repeat the inner loop until at least 1 ms has been measured */
+ kpreempt_disable();
+ start = gethrtime();
+ do {
+ for (l = 0; l < loops; l++, run_count++)
+ cs->func(abd, size, ctx, &zcp);
+
+ run_time_ns = gethrtime() - start;
+ } while (run_time_ns < MSEC2NSEC(1));
+ kpreempt_enable();
+
+ run_bw = size * run_count * NANOSEC;
+ run_bw /= run_time_ns; /* B/s */
+ *result = run_bw/1024/1024; /* MiB/s */
+}
+
+/*
+ * Benchmark one implementation across all seven buffer sizes, filling
+ * in the bs1k..bs4m fields of *cs.
+ */
+static void
+chksum_benchit(chksum_stat_t *cs)
+{
+ abd_t *abd;
+ void *ctx = 0;
+ void *salt = &cs->salt.zcs_bytes;
+
+ /* allocate test memory via default abd interface */
+ abd = abd_alloc_linear(1<<22, B_FALSE);
+ /* all-zero salt; implementations without an init run unsalted */
+ memset(salt, 0, sizeof (cs->salt.zcs_bytes));
+ if (cs->init) {
+ ctx = cs->init(&cs->salt);
+ }
+
+ /* rounds 1..7 cover 1k .. 4m buffer sizes (see chksum_run()) */
+ chksum_run(cs, abd, ctx, 1, &cs->bs1k);
+ chksum_run(cs, abd, ctx, 2, &cs->bs4k);
+ chksum_run(cs, abd, ctx, 3, &cs->bs16k);
+ chksum_run(cs, abd, ctx, 4, &cs->bs64k);
+ chksum_run(cs, abd, ctx, 5, &cs->bs256k);
+ chksum_run(cs, abd, ctx, 6, &cs->bs1m);
+ chksum_run(cs, abd, ctx, 7, &cs->bs4m);
+
+ /* free up temp memory */
+ if (cs->free) {
+ cs->free(ctx);
+ }
+ abd_free(abd);
+}
+
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+chksum_benchmark(void)
+{
+
+#ifndef _KERNEL
+ /*
+  * The benchmark is only needed for the kernel module; userland
+  * builds return here, leaving chksum_stat_cnt at 0 so that
+  * chksum_fini() skips the free.
+  */
+ return;
+#endif
+
+ chksum_stat_t *cs;
+ int cbid = 0, id;
+ uint64_t max = 0;
+
+ /* space for the benchmark times */
+ chksum_stat_cnt = 4;
+ chksum_stat_cnt += blake3_get_impl_count();
+ chksum_stat_data = (chksum_stat_t *)kmem_zalloc(
+ sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP);
+
+ /* edonr */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_edonr_tmpl_init;
+ cs->func = abd_checksum_edonr_native;
+ cs->free = abd_checksum_edonr_tmpl_free;
+ cs->name = "edonr";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* skein */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_skein_tmpl_init;
+ cs->func = abd_checksum_skein_native;
+ cs->free = abd_checksum_skein_tmpl_free;
+ cs->name = "skein";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* sha256 */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_SHA256;
+ cs->free = 0;
+ cs->name = "sha256";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* sha512 */
+ cs = &chksum_stat_data[cbid++];
+ cs->init = 0;
+ cs->func = abd_checksum_SHA512_native;
+ cs->free = 0;
+ cs->name = "sha512";
+ cs->impl = "generic";
+ chksum_benchit(cs);
+
+ /* blake3: benchmark every implementation */
+ for (id = 0; id < blake3_get_impl_count(); id++) {
+ blake3_set_impl_id(id);
+ cs = &chksum_stat_data[cbid++];
+ cs->init = abd_checksum_blake3_tmpl_init;
+ cs->func = abd_checksum_blake3_native;
+ cs->free = abd_checksum_blake3_tmpl_free;
+ cs->name = "blake3";
+ cs->impl = blake3_get_impl_name();
+ chksum_benchit(cs);
+ /* remember the fastest implementation by 256k throughput */
+ if (cs->bs256k > max) {
+ max = cs->bs256k;
+ blake3_set_impl_fastest(id);
+ }
+ }
+}
+
+void
+chksum_init(void)
+{
+
+ /* Benchmark supported implementations */
+ chksum_benchmark();
+
+ /*
+  * Install kstats for all implementations; the kstat is virtual, its
+  * rows come from chksum_stat_data via the raw-ops callbacks below.
+  */
+ chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc",
+ KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+
+ if (chksum_kstat != NULL) {
+ chksum_kstat->ks_data = NULL;
+ /* unbounded; chksum_stat_kstat_addr() terminates the iteration */
+ chksum_kstat->ks_ndata = UINT32_MAX;
+ kstat_set_raw_ops(chksum_kstat,
+ chksum_stat_kstat_headers,
+ chksum_stat_kstat_data,
+ chksum_stat_kstat_addr);
+ kstat_install(chksum_kstat);
+ }
+
+ /* setup implementations */
+ blake3_setup_impl();
+}
+
+void
+chksum_fini(void)
+{
+ if (chksum_kstat != NULL) {
+ kstat_delete(chksum_kstat);
+ chksum_kstat = NULL;
+ }
+
+ /* cnt stays 0 on userland builds, where no stats were allocated */
+ if (chksum_stat_cnt) {
+ kmem_free(chksum_stat_data,
+ sizeof (chksum_stat_t) * chksum_stat_cnt);
+ chksum_stat_cnt = 0;
+ chksum_stat_data = 0;
+ }
+}
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index d89e57653..3c5cdf604 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -195,6 +195,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free,
ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+ {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap},
+ abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"},
};
/*
@@ -207,6 +211,8 @@ zio_checksum_to_feature(enum zio_checksum cksum)
VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
switch (cksum) {
+ case ZIO_CHECKSUM_BLAKE3:
+ return (SPA_FEATURE_BLAKE3);
case ZIO_CHECKSUM_SHA512:
return (SPA_FEATURE_SHA512);
case ZIO_CHECKSUM_SKEIN:
diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run
index 4ff46e7af..243221598 100644
--- a/tests/runfiles/common.run
+++ b/tests/runfiles/common.run
@@ -113,8 +113,8 @@ tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit',
tags = ['functional', 'channel_program', 'synctask_core']
[tests/functional/checksum]
-tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'filetest_001_pos',
- 'filetest_002_pos']
+tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test',
+ 'filetest_001_pos', 'filetest_002_pos']
tags = ['functional', 'checksum']
[tests/functional/clean_mirror]
diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore
index 1830cab76..20d138253 100644
--- a/tests/zfs-tests/cmd/.gitignore
+++ b/tests/zfs-tests/cmd/.gitignore
@@ -42,6 +42,7 @@
/ereports
/zfs_diff-socket
/dosmode_readonly_write
+/blake3_test
/edonr_test
/skein_test
/sha2_test
diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am
index e3c9874dc..3c8faf5af 100644
--- a/tests/zfs-tests/cmd/Makefile.am
+++ b/tests/zfs-tests/cmd/Makefile.am
@@ -98,15 +98,19 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/ereports
libzfs.la
-scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test %D%/sha2_test
+scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test \
+ %D%/sha2_test %D%/blake3_test
%C%_skein_test_SOURCES = %D%/checksum/skein_test.c
%C%_sha2_test_SOURCES = %D%/checksum/sha2_test.c
%C%_edonr_test_SOURCES = %D%/checksum/edonr_test.c
+%C%_blake3_test_SOURCES = %D%/checksum/blake3_test.c
%C%_skein_test_LDADD = \
libicp.la \
+ libspl.la \
libspl_assert.la
%C%_sha2_test_LDADD = $(%C%_skein_test_LDADD)
%C%_edonr_test_LDADD = $(%C%_skein_test_LDADD)
+%C%_blake3_test_LDADD = $(%C%_skein_test_LDADD)
if BUILD_LINUX
diff --git a/tests/zfs-tests/cmd/checksum/blake3_test.c b/tests/zfs-tests/cmd/checksum/blake3_test.c
new file mode 100644
index 000000000..55d268f5f
--- /dev/null
+++ b/tests/zfs-tests/cmd/checksum/blake3_test.c
@@ -0,0 +1,575 @@
+
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <sys/blake3.h>
+
+/*
+ * Change this #undef to a #define to enable verbose debug output.
+ */
+#undef BLAKE3_DEBUG
+
+/*
+ * C version of:
+ * https://github.com/BLAKE3-team/BLAKE3/tree/master/test_vectors
+ */
+typedef struct {
+ /* input length for this entry */
+ const int input_len;
+
+ /* hash value */
+ const char *hash;
+
+ /* salted hash value */
+ const char *shash;
+} blake3_test_t;
+
+/* BLAKE3 output is variable-length; 262 holds the test vectors' hex digests */
+#define TEST_DIGEST_LEN 262
+
+/*
+ * 32-byte key (as required by BLAKE3 keyed mode) used for the keyed-hash vectors
+ */
+static const char *salt = "whats the Elvish word for friend";
+
+static blake3_test_t TestArray[] = {
+ {
+ 0,
+ "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e0"
+ "0f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5"
+ "487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c2"
+ "2e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d",
+ "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b1"
+ "8171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73"
+ "cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be589"
+ "60856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f",
+ },
+ {
+ 1,
+ "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3"
+ "a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358a"
+ "d4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4"
+ "081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5",
+ "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b65"
+ "68c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0c"
+ "f7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f9"
+ "8fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11",
+ },
+ {
+ 2,
+ "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8"
+ "386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac"
+ "61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a43226"
+ "3a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1",
+ "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9f"
+ "fbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f65"
+ "8be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786"
+ "024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f",
+ },
+ {
+ 3,
+ "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b"
+ "49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454"
+ "b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cd"
+ "d0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134",
+ "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9"
+ "142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1"
+ "a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690"
+ "bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f",
+ },
+ {
+ 4,
+ "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f"
+ "603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b"
+ "56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e657972117"
+ "01dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12",
+ "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe07011"
+ "6c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf"
+ "81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf"
+ "0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a",
+ },
+ {
+ 5,
+ "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2eb"
+ "cfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2c"
+ "a748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c9999"
+ "04037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620",
+ "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616a"
+ "b199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b"
+ "38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0"
+ "d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218",
+ },
+ {
+ 6,
+ "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c84461"
+ "1a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435"
+ "d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac"
+ "1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a",
+ "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2"
+ "ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e880780084"
+ "2a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887"
+ "603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256",
+ },
+ {
+ 7,
+ "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66"
+ "036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a9"
+ "41f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fe"
+ "f1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c",
+ "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c"
+ "5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5f"
+ "d6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617"
+ "bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6",
+ },
+ {
+ 8,
+ "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb72"
+ "5d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f"
+ "9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a2"
+ "2e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c",
+ "be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048"
+ "eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d"
+ "13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305ab"
+ "f86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276",
+ },
+ {
+ 63,
+ "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b11"
+ "97012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf46"
+ "87093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3"
+ "ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755",
+ "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea0"
+ "5a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847ab"
+ "b38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f116783"
+ "77483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d",
+ },
+ {
+ 64,
+ "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc"
+ "9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7f"
+ "bb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95"
+ "b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74",
+ "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e682"
+ "44c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f7"
+ "7a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c92"
+ "55306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb",
+ },
+ {
+ 65,
+ "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e"
+ "16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3"
+ "a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d"
+ "0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c",
+ "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5"
+ "e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc"
+ "5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b904149"
+ "7de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad",
+ },
+ {
+ 127,
+ "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3"
+ "137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da4"
+ "7644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc4435"
+ "5b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78",
+ "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee"
+ "7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd5"
+ "4663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d"
+ "135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc",
+ },
+ {
+ 128,
+ "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa6"
+ "9faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ec"
+ "ba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f7"
+ "5e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c",
+ "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd8"
+ "6bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50"
+ "edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad"
+ "92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5",
+ },
+ {
+ 129,
+ "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f9"
+ "6ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c71"
+ "27bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7"
+ "e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7",
+ "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c"
+ "9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aa"
+ "ee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412c"
+ "d8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683",
+ },
+ {
+ 1023,
+ "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a1"
+ "82d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56"
+ "778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b2"
+ "8f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485",
+ "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e89"
+ "0316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13e"
+ "fd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc"
+ "97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10",
+ },
+ {
+ 1024,
+ "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71c"
+ "f8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f"
+ "6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d91"
+ "7f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e",
+ "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a7"
+ "8bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a"
+ "8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b50002"
+ "36df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de",
+ },
+ {
+ 1025,
+ "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4"
+ "c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332"
+ "b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f9"
+ "55c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a",
+ "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea6936"
+ "2396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd535"
+ "2720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123"
+ "872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930",
+ },
+ {
+ 2048,
+ "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a"
+ "60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d0"
+ "63f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c6"
+ "7ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9",
+ "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd101"
+ "73b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b2"
+ "2f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef860"
+ "54f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe",
+ },
+ {
+ 2049,
+ "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096"
+ "de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae9"
+ "8764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d9042"
+ "5a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3",
+ "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9"
+ "a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81"
+ "447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c6464"
+ "99ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e",
+ },
+ {
+ 3072,
+ "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a"
+ "3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d12"
+ "0258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d15"
+ "99b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11",
+ "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022"
+ "f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c78"
+ "3a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b"
+ "996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b",
+ },
+ {
+ 3073,
+ "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a"
+ "27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b"
+ "639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd"
+ "66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf",
+ "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96"
+ "d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfdd"
+ "d6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea"
+ "2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5",
+ },
+ {
+ 4096,
+ "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e96902"
+ "89e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85"
+ "c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0"
+ "062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620",
+ "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bb"
+ "b64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c"
+ "757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb1"
+ "7d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de",
+ },
+ {
+ 4097,
+ "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505"
+ "f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd"
+ "26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61db"
+ "e091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956",
+ "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc60"
+ "6db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce"
+ "595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e9"
+ "00809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f",
+ },
+ {
+ 5120,
+ "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833ac"
+ "c61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a"
+ "0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9"
+ "321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059",
+ "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b"
+ "4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495"
+ "f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f"
+ "9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e",
+ },
+ {
+ 5121,
+ "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96"
+ "adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647"
+ "eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204"
+ "ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95",
+ "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d"
+ "07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c810"
+ "50b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092"
+ "133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d",
+ },
+ {
+ 6144,
+ "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d"
+ "742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade1"
+ "56c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c"
+ "6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83",
+ "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc3"
+ "5754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f79075"
+ "61f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486"
+ "b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e",
+ },
+ {
+ 6145,
+ "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18"
+ "a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb01501"
+ "5532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a"
+ "3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022",
+ "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3"
+ "c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283"
+ "ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f"
+ "372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c",
+ },
+ {
+ 7168,
+ "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a57"
+ "07c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165"
+ "b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de"
+ "4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95",
+ "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2"
+ "f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba34840098"
+ "9a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3e"
+ "aebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52",
+ },
+ {
+ 7169,
+ "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798"
+ "a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9"
+ "b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a735485228"
+ "40779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8",
+ "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c"
+ "9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabc"
+ "b438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc8566"
+ "17c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54",
+ },
+ {
+ 8192,
+ "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635f"
+ "e51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a4777"
+ "8566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62"
+ "712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf",
+ "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a48"
+ "34464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc"
+ "40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121"
+ "cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102",
+ },
+ {
+ 8193,
+ "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2"
+ "282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea6"
+ "0bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0"
+ "b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6",
+ "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f0"
+ "3228648fd983aef045c2fa8290934b0866b615f585149587dda229903996532883"
+ "5a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e0"
+ "9df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57",
+ },
+ {
+ 16384,
+ "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d"
+ "764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb3"
+ "9a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e475"
+ "03f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893",
+ "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9"
+ "e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960"
+ "ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725"
+ "581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65",
+ },
+ {
+ 31744,
+ "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c4786"
+ "0cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f"
+ "5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac97"
+ "8bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f",
+ "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a"
+ "7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628"
+ "be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c"
+ "2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec",
+ },
+ {
+ 102400,
+ "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e0"
+ "1c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f"
+ "0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c"
+ "009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e",
+ "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9"
+ "dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2"
+ "aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b"
+ "354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4",
+ },
+ {
+ 0, 0, 0
+ }
+};
+
+#ifdef BLAKE3_DEBUG
+#define dprintf printf
+#else
+#define dprintf(...)
+#endif
+
+static char fmt_tohex(char c);
+static size_t fmt_hexdump(char *dest, const char *src, size_t len);
+
+static char fmt_tohex(char c) {
+ return ((char)(c >= 10 ? c-10+'a' : c+'0'));
+}
+
+static size_t fmt_hexdump(char *dest, const char *src, size_t len) {
+ register const unsigned char *s = (const unsigned char *) src;
+ size_t written = 0, i;
+
+ if (!dest)
+ return ((len > ((size_t)-1)/2) ? (size_t)-1 : len*2);
+ for (i = 0; i < len; ++i) {
+ dest[written] = fmt_tohex(s[i]>>4);
+ dest[written+1] = fmt_tohex(s[i]&15);
+ written += 2;
+ }
+
+ return (written);
+}
+
+int
+main(int argc, char *argv[])
+{
+ boolean_t failed = B_FALSE;
+ uint8_t buffer[102400];
+ uint64_t cpu_mhz = 0;
+ int id, i, j;
+
+ if (argc == 2)
+ cpu_mhz = atoi(argv[1]);
+
+ /* fill test message */
+ for (i = 0, j = 0; i < sizeof (buffer); i++, j++) {
+ if (j == 251)
+ j = 0;
+ buffer[i] = (uint8_t)j;
+ }
+
+ (void) printf("Running algorithm correctness tests:\n");
+ for (id = 0; id < blake3_get_impl_count(); id++) {
+ blake3_set_impl_id(id);
+ const char *name = blake3_get_impl_name();
+ dprintf("Result for BLAKE3-%s:\n", name);
+ for (i = 0; TestArray[i].hash; i++) {
+ blake3_test_t *cur = &TestArray[i];
+
+ BLAKE3_CTX ctx;
+ uint8_t digest[TEST_DIGEST_LEN];
+ char result[TEST_DIGEST_LEN];
+
+ /* default hashing */
+ Blake3_Init(&ctx);
+ Blake3_Update(&ctx, buffer, cur->input_len);
+ Blake3_FinalSeek(&ctx, 0, digest, TEST_DIGEST_LEN);
+ fmt_hexdump(result, (char *)digest, 131);
+ if (memcmp(result, cur->hash, 131) != 0)
+ failed = B_TRUE;
+
+ dprintf("HASH-res: %s\n", result);
+ dprintf("HASH-ref: %s\n", cur->hash);
+
+ /* salted hashing */
+ Blake3_InitKeyed(&ctx, (const uint8_t *)salt);
+ Blake3_Update(&ctx, buffer, cur->input_len);
+ Blake3_FinalSeek(&ctx, 0, digest, TEST_DIGEST_LEN);
+ fmt_hexdump(result, (char *)digest, 131);
+ if (memcmp(result, cur->shash, 131) != 0)
+ failed = B_TRUE;
+
+ dprintf("SHASH-res: %s\n", result);
+ dprintf("SHASH-ref: %s\n", cur->shash);
+
+ printf("BLAKE3-%s Message (inlen=%d)\tResult: %s\n",
+ name, cur->input_len, failed?"FAILED!":"OK");
+ }
+ }
+
+ if (failed)
+ return (1);
+
+#define BLAKE3_PERF_TEST(impl, diglen) \
+ do { \
+ BLAKE3_CTX ctx; \
+ uint8_t digest[diglen / 8]; \
+ uint8_t block[131072]; \
+ uint64_t delta; \
+ double cpb = 0; \
+ int i; \
+ struct timeval start, end; \
+ memset(block, 0, sizeof (block)); \
+ (void) gettimeofday(&start, NULL); \
+ Blake3_Init(&ctx); \
+ for (i = 0; i < 8192; i++) \
+ Blake3_Update(&ctx, block, sizeof (block)); \
+ Blake3_Final(&ctx, digest); \
+ (void) gettimeofday(&end, NULL); \
+ delta = (end.tv_sec * 1000000llu + end.tv_usec) - \
+ (start.tv_sec * 1000000llu + start.tv_usec); \
+ if (cpu_mhz != 0) { \
+ cpb = (cpu_mhz * 1e6 * ((double)delta / \
+ 1000000)) / (8192 * 128 * 1024); \
+ } \
+ (void) printf("BLAKE3-%s %llu us (%.02f CPB)\n", impl, \
+ (u_longlong_t)delta, cpb); \
+ } while (0)
+
+ printf("Running performance tests (hashing 1024 MiB of data):\n");
+ for (id = 0; id < blake3_get_impl_count(); id++) {
+ blake3_set_impl_id(id);
+ const char *name = blake3_get_impl_name();
+ BLAKE3_PERF_TEST(name, 256);
+ }
+
+ return (0);
+}
diff --git a/tests/zfs-tests/cmd/checksum/edonr_test.c b/tests/zfs-tests/cmd/checksum/edonr_test.c
index c6365a414..3a0a48533 100644
--- a/tests/zfs-tests/cmd/checksum/edonr_test.c
+++ b/tests/zfs-tests/cmd/checksum/edonr_test.c
@@ -28,9 +28,6 @@
* gettimeofday due to -D_KERNEL (we can do this since we're actually
* running in userspace, but we need -D_KERNEL for the remaining Edon-R code).
*/
-#ifdef _KERNEL
-#undef _KERNEL
-#endif
#include <sys/edonr.h>
#include <stdlib.h>
diff --git a/tests/zfs-tests/cmd/checksum/sha2_test.c b/tests/zfs-tests/cmd/checksum/sha2_test.c
index dc4173e10..bb3553110 100644
--- a/tests/zfs-tests/cmd/checksum/sha2_test.c
+++ b/tests/zfs-tests/cmd/checksum/sha2_test.c
@@ -28,9 +28,6 @@
* gettimeofday due to -D_KERNEL (we can do this since we're actually
* running in userspace, but we need -D_KERNEL for the remaining SHA2 code).
*/
-#ifdef _KERNEL
-#undef _KERNEL
-#endif
#include <stdarg.h>
#include <stdlib.h>
diff --git a/tests/zfs-tests/cmd/checksum/skein_test.c b/tests/zfs-tests/cmd/checksum/skein_test.c
index 99b47b453..13611c860 100644
--- a/tests/zfs-tests/cmd/checksum/skein_test.c
+++ b/tests/zfs-tests/cmd/checksum/skein_test.c
@@ -28,9 +28,6 @@
* gettimeofday due to -D_KERNEL (we can do this since we're actually
* running in userspace, but we need -D_KERNEL for the remaining Skein code).
*/
-#ifdef _KERNEL
-#undef _KERNEL
-#endif
#include <sys/skein.h>
#include <stdlib.h>
diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg
index 9dc2b4d0e..99430bc10 100644
--- a/tests/zfs-tests/include/commands.cfg
+++ b/tests/zfs-tests/include/commands.cfg
@@ -212,6 +212,7 @@ export ZFSTEST_FILES='badsend
zed_fd_spill-zedlet
suid_write_to_file
cp_files
+ blake3_test
edonr_test
skein_test
sha2_test
diff --git a/tests/zfs-tests/include/properties.shlib b/tests/zfs-tests/include/properties.shlib
index ba82f9620..14b3f4415 100644
--- a/tests/zfs-tests/include/properties.shlib
+++ b/tests/zfs-tests/include/properties.shlib
@@ -17,7 +17,7 @@
typeset -a compress_prop_vals=('off' 'lzjb' 'lz4' 'gzip' 'zle' 'zstd')
typeset -a checksum_prop_vals=('on' 'off' 'fletcher2' 'fletcher4' 'sha256'
- 'noparity' 'sha512' 'skein')
+ 'noparity' 'sha512' 'skein' 'blake3')
if ! is_freebsd; then
checksum_prop_vals+=('edonr')
fi
diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am
index a91a24d16..ffc087351 100644
--- a/tests/zfs-tests/tests/Makefile.am
+++ b/tests/zfs-tests/tests/Makefile.am
@@ -545,6 +545,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/checksum/cleanup.ksh \
functional/checksum/filetest_001_pos.ksh \
functional/checksum/filetest_002_pos.ksh \
+ functional/checksum/run_blake3_test.ksh \
functional/checksum/run_edonr_test.ksh \
functional/checksum/run_sha2_test.ksh \
functional/checksum/run_skein_test.ksh \
diff --git a/tests/zfs-tests/tests/functional/checksum/default.cfg b/tests/zfs-tests/tests/functional/checksum/default.cfg
index afb956093..a7e143e75 100644
--- a/tests/zfs-tests/tests/functional/checksum/default.cfg
+++ b/tests/zfs-tests/tests/functional/checksum/default.cfg
@@ -30,4 +30,4 @@
. $STF_SUITE/include/libtest.shlib
-set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr"
+set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "blake3" "sha256" "sha512" "skein" "edonr"
diff --git a/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh b/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh
new file mode 100755
index 000000000..cf1ca7032
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# Description:
+# Run the tests for the BLAKE3 hash algorithm.
+#
+
+log_assert "Run the tests for the BLAKE3 hash algorithm."
+
+freq=$(get_cpu_freq)
+log_must blake3_test $freq
+
+log_pass "BLAKE3 tests passed."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh
index 27003b21b..cab7c185e 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh
@@ -46,7 +46,7 @@
verify_runnable "both"
set -A dataset "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL/$TESTVOL"
-set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "noparity"
+set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "blake3" "noparity"
log_assert "Setting a valid checksum on a file system, volume," \
"it should be successful."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index 4ea5725e0..7849ed226 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -99,5 +99,6 @@ if is_linux || is_freebsd; then
"feature@zstd_compress"
"feature@zilsaxattr"
"feature@head_errlog"
+ "feature@blake3"
)
fi