diff options
53 files changed, 22804 insertions, 52 deletions
@@ -285,6 +285,7 @@ CONTRIBUTORS: Tim Connors <[email protected]> Tim Crawford <[email protected]> Tim Haley <[email protected]> + Tino Reichardt <[email protected]> Tobin Harding <[email protected]> Tom Caputi <[email protected]> Tom Matthews <[email protected]> diff --git a/cmd/ztest.c b/cmd/ztest.c index ca05cf265..95f6107ff 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -121,6 +121,7 @@ #include <sys/zfeature.h> #include <sys/dsl_userhold.h> #include <sys/abd.h> +#include <sys/blake3.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> @@ -417,6 +418,7 @@ ztest_func_t ztest_device_removal; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; ztest_func_t ztest_trim; +ztest_func_t ztest_blake3; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; @@ -470,6 +472,7 @@ ztest_info_t ztest_info[] = { ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), ZTI_INIT(ztest_trim, 1, &zopt_sometimes), + ZTI_INIT(ztest_blake3, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), @@ -6374,6 +6377,92 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) } void +ztest_blake3(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + hrtime_t end = gethrtime() + NANOSEC; + zio_cksum_salt_t salt; + void *salt_ptr = &salt.zcs_bytes; + struct abd *abd_data, *abd_meta; + void *buf, *templ; + int i, *ptr; + uint32_t size; + BLAKE3_CTX ctx; + + size = ztest_random_blocksize(); + buf = umem_alloc(size, UMEM_NOFAIL); + abd_data = abd_alloc(size, B_FALSE); + abd_meta = abd_alloc(size, B_TRUE); + + for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) + *ptr = ztest_random(UINT_MAX); + memset(salt_ptr, 'A', 32); + + abd_copy_from_buf_off(abd_data, buf, 0, size); + abd_copy_from_buf_off(abd_meta, buf, 0, size); + + while (gethrtime() <= end) { + int run_count = 100; + zio_cksum_t zc_ref1, zc_ref2; + zio_cksum_t zc_res1, zc_res2; + + void *ref1 = &zc_ref1; + void *ref2 = &zc_ref2; + void *res1 = &zc_res1; + void *res2 = &zc_res2; + + /* BLAKE3_KEY_LEN = 32 */ + VERIFY0(blake3_set_impl_name("generic")); + templ = abd_checksum_blake3_tmpl_init(&salt); + Blake3_InitKeyed(&ctx, salt_ptr); + Blake3_Update(&ctx, buf, size); + Blake3_Final(&ctx, ref1); + zc_ref2 = zc_ref1; + ZIO_CHECKSUM_BSWAP(&zc_ref2); + abd_checksum_blake3_tmpl_free(templ); + + VERIFY0(blake3_set_impl_name("cycle")); + while (run_count-- > 0) { + + /* Test current implementation */ + Blake3_InitKeyed(&ctx, salt_ptr); + Blake3_Update(&ctx, buf, size); + Blake3_Final(&ctx, res1); + zc_res2 = zc_res1; + ZIO_CHECKSUM_BSWAP(&zc_res2); + + VERIFY0(memcmp(ref1, res1, 32)); + VERIFY0(memcmp(ref2, res2, 32)); + + /* Test ABD - data */ + templ = abd_checksum_blake3_tmpl_init(&salt); + abd_checksum_blake3_native(abd_data, size, + templ, &zc_res1); + abd_checksum_blake3_byteswap(abd_data, size, + templ, &zc_res2); + + VERIFY0(memcmp(ref1, res1, 32)); + VERIFY0(memcmp(ref2, res2, 32)); + + /* Test ABD - metadata */ + abd_checksum_blake3_native(abd_meta, size, + templ, &zc_res1); + abd_checksum_blake3_byteswap(abd_meta, size, + templ, &zc_res2); + abd_checksum_blake3_tmpl_free(templ); + + VERIFY0(memcmp(ref1, res1, 32)); + VERIFY0(memcmp(ref2, res2, 32)); + + } + } + + abd_free(abd_data); + abd_free(abd_meta); + umem_free(buf, size); +} + +void ztest_fletcher(ztest_ds_t *zd, uint64_t id) { (void) zd, (void) id; diff --git a/config/always-arch.m4 b/config/always-arch.m4 index 02c8e4775..f7090a482 100644 --- a/config/always-arch.m4 +++ b/config/always-arch.m4 @@ -30,6 +30,8 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [ ;; esac + AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64) AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64) AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc) + AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64) ]) diff --git a/include/Makefile.am b/include/Makefile.am index eee989d4a..1a7f67e9c 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -23,6 +23,7 @@ COMMON_H = \ sys/avl.h \ sys/avl_impl.h \ sys/bitops.h \ + sys/blake3.h \ sys/blkptr.h \ sys/bplist.h \ sys/bpobj.h \ @@ -117,6 +118,7 @@ COMMON_H = \ sys/zfeature.h \ sys/zfs_acl.h \ sys/zfs_bootenv.h \ + sys/zfs_chksum.h \ sys/zfs_context.h \ sys/zfs_debug.h \ sys/zfs_delay.h \ diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h index a46a3a18b..90b077a7b 100644 --- a/include/os/freebsd/spl/sys/ccompile.h +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -74,10 +74,12 @@ extern "C" { #ifndef LOCORE #ifndef HAVE_RPC_TYPES +#ifndef _KERNEL typedef int bool_t; typedef int enum_t; #endif #endif +#endif #ifndef __cplusplus #define __init diff --git a/include/os/linux/kernel/linux/simd_powerpc.h b/include/os/linux/kernel/linux/simd_powerpc.h index 108cef22f..31e51ea20 100644 --- a/include/os/linux/kernel/linux/simd_powerpc.h +++ b/include/os/linux/kernel/linux/simd_powerpc.h @@ -57,25 +57,45 @@ #include <sys/types.h> #include <linux/version.h> -#define kfpu_allowed() 1 -#define kfpu_begin() \ - { \ - preempt_disable(); \ - enable_kernel_altivec(); \ - } +#define kfpu_allowed() 1 + #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) #define kfpu_end() \ { \ + disable_kernel_vsx(); \ disable_kernel_altivec(); \ preempt_enable(); \ } +#define kfpu_begin() \ + { \ + preempt_disable(); \ + enable_kernel_altivec(); \ + enable_kernel_vsx(); \ + } #else -/* seems that before 4.5 no-one bothered disabling ... */ +/* seems that before 4.5 no-one bothered */ +#define kfpu_begin() #define kfpu_end() preempt_enable() #endif #define kfpu_init() 0 #define kfpu_fini() ((void) 0) +static inline boolean_t +zfs_vsx_available(void) +{ + boolean_t res; +#if defined(__powerpc64__) + u64 msr; +#else + u32 msr; +#endif + kfpu_begin(); + __asm volatile("mfmsr %0" : "=r"(msr)); + res = (msr & 0x800000) != 0; + kfpu_end(); + return (res); +} + /* * Check if AltiVec instruction set is available */ diff --git a/include/sys/blake3.h b/include/sys/blake3.h new file mode 100644 index 000000000..e6650372c --- /dev/null +++ b/include/sys/blake3.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021 Tino Reichardt <[email protected]> + */ + +#ifndef BLAKE3_H +#define BLAKE3_H + +#ifdef _KERNEL +#include <sys/types.h> +#else +#include <stdint.h> +#include <stdlib.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_MAX_DEPTH 54 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 + +/* + * This struct is a private implementation detail. + * It has to be here because it's part of BLAKE3_CTX below. + */ +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state_t; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state_t chunk; + uint8_t cv_stack_len; + + /* + * The stack size is MAX_DEPTH + 1 because we do lazy merging. For + * example, with 7 chunks, we have 3 entries in the stack. Adding an + * 8th chunk requires a 4th entry, rather than merging everything down + * to 1, because we don't know whether more input is coming. This is + * different from how the reference implementation does things. + */ + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; + + /* const blake3_impl_ops_t *ops */ + const void *ops; +} BLAKE3_CTX; + +/* init the context for hash operation */ +void Blake3_Init(BLAKE3_CTX *ctx); + +/* init the context for a MAC and/or tree hash operation */ +void Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]); + +/* process the input bytes */ +void Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t input_len); + +/* finalize the hash computation and output the result */ +void Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out); + +/* finalize the hash computation and output the result */ +void Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out, + size_t out_len); + +/* return number of supported implementations */ +extern int blake3_get_impl_count(void); + +/* return id of selected implementation */ +extern int blake3_get_impl_id(void); + +/* return name of selected implementation */ +extern const char *blake3_get_impl_name(void); + +/* setup id as fastest implementation */ +extern void blake3_set_impl_fastest(uint32_t id); + +/* set implementation by id */ +extern void blake3_set_impl_id(uint32_t id); + +/* set implementation by name */ +extern int blake3_set_impl_name(const char *name); + +/* set startup implementation */ +extern void blake3_setup_impl(void); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/include/sys/zfs_chksum.h b/include/sys/zfs_chksum.h new file mode 100644 index 000000000..cfd07bd0f --- /dev/null +++ b/include/sys/zfs_chksum.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021 Tino Reichardt <[email protected]> + */ + +#ifndef _ZFS_CHKSUM_H +#define _ZFS_CHKSUM_H + +#ifdef _KERNEL +#include <sys/types.h> +#else +#include <stdint.h> +#include <stdlib.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Benchmark the chksums of ZFS when the module is loading */ +void chksum_init(void); +void chksum_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CHKSUM_H */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 4fb15636e..945221796 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -124,6 +124,7 @@ typedef enum drr_headertype { * default use of "zfs send" won't encounter the bug mentioned above. */ #define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) +#define DMU_BACKUP_FEATURE_BLAKE3 (1 << 28) /* * Mask of all supported backup features @@ -134,7 +135,7 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ - DMU_BACKUP_FEATURE_ZSTD) + DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_BLAKE3) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) diff --git a/include/sys/zio.h b/include/sys/zio.h index 7b78f0878..4b624165f 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -89,6 +89,7 @@ enum zio_checksum { ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, + ZIO_CHECKSUM_BLAKE3, ZIO_CHECKSUM_FUNCTIONS }; diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 9a73a6262..a2ce50816 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -21,7 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright Saso Kiselkov 2013, All rights reserved. + * Copyright (c) 2013 Saso Kiselkov, All rights reserved. + * Copyright (c) 2021 Tino Reichardt <[email protected]> */ #ifndef _SYS_ZIO_CHECKSUM_H @@ -107,6 +108,8 @@ _SYS_ZIO_CHECKSUM_H zio_checksum_info_t /* * Checksum routines. */ + +/* SHA2 */ extern zio_checksum_t abd_checksum_SHA256; extern zio_checksum_t abd_checksum_SHA512_native; extern zio_checksum_t abd_checksum_SHA512_byteswap; @@ -123,6 +126,13 @@ extern zio_checksum_t abd_checksum_edonr_byteswap; extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; +/* BLAKE3 */ +extern zio_checksum_t abd_checksum_blake3_native; +extern zio_checksum_t abd_checksum_blake3_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_blake3_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_blake3_tmpl_free; + +/* Fletcher 4 */ _SYS_ZIO_CHECKSUM_H zio_abd_checksum_func_t fletcher_4_abd_ops; extern zio_checksum_t abd_fletcher_4_native; extern zio_checksum_t abd_fletcher_4_byteswap; diff --git a/include/zfeature_common.h b/include/zfeature_common.h index d4d636f9c..d98345fe6 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -77,6 +77,7 @@ typedef enum spa_feature { SPA_FEATURE_DRAID, SPA_FEATURE_ZILSAXATTR, SPA_FEATURE_HEAD_ERRLOG, + SPA_FEATURE_BLAKE3, SPA_FEATURES } spa_feature_t; diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index 304f49e39..b7f1d0e1b 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -13,6 +13,10 @@ nodist_libicp_la_SOURCES = \ module/icp/algs/aes/aes_impl_x86-64.c \ module/icp/algs/aes/aes_impl.c \ module/icp/algs/aes/aes_modes.c \ + module/icp/algs/blake3/blake3.c \ + module/icp/algs/blake3/blake3_generic.c \ + module/icp/algs/blake3/blake3_impl.c \ + module/icp/algs/blake3/blake3_x86-64.c \ module/icp/algs/edonr/edonr.c \ module/icp/algs/modes/modes.c \ module/icp/algs/modes/cbc.c \ @@ -36,15 +40,30 @@ nodist_libicp_la_SOURCES = \ module/icp/core/kcf_mech_tabs.c \ module/icp/core/kcf_prov_tabs.c -if TARGET_CPU_X86_64 +if TARGET_CPU_AARCH64 +nodist_libicp_la_SOURCES += \ + module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \ + module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +endif + +if TARGET_CPU_POWERPC nodist_libicp_la_SOURCES += \ - module/icp/asm-x86_64/aes/aeskey.c + module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \ + module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S +endif + +if TARGET_CPU_X86_64 nodist_libicp_la_SOURCES += \ + module/icp/asm-x86_64/aes/aeskey.c \ module/icp/asm-x86_64/aes/aes_amd64.S \ module/icp/asm-x86_64/aes/aes_aesni.S \ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \ module/icp/asm-x86_64/modes/ghash-x86_64.S \ module/icp/asm-x86_64/sha2/sha256_impl.S \ - module/icp/asm-x86_64/sha2/sha512_impl.S + module/icp/asm-x86_64/sha2/sha512_impl.S \ + module/icp/asm-x86_64/blake3/blake3_avx2.S \ + module/icp/asm-x86_64/blake3/blake3_avx512.S \ + module/icp/asm-x86_64/blake3/blake3_sse2.S \ + module/icp/asm-x86_64/blake3/blake3_sse41.S endif diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h index 6ef836c16..6a6d8b7c6 100644 --- a/lib/libspl/include/sys/simd.h +++ b/lib/libspl/include/sys/simd.h @@ -491,6 +491,24 @@ zfs_altivec_available(void) #endif return (has_altivec); } +static inline boolean_t +zfs_vsx_available(void) +{ + boolean_t has_vsx = B_FALSE; +#if defined(__ALTIVEC__) && !defined(__FreeBSD__) + sighandler_t savesig; + savesig = signal(SIGILL, sigillhandler); + if (setjmp(env)) { + signal(SIGILL, savesig); + has_vsx = B_FALSE; + } else { + __asm__ __volatile__("xssubsp 0,0,0\n"); + signal(SIGILL, savesig); + has_vsx = B_TRUE; + } +#endif + return (has_vsx); +} #else #define kfpu_allowed() 0 diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 8a71da951..9f9a2f907 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -583,7 +583,7 @@ <elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> - <elf-symbol name='spa_feature_table' size='2016' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> + <elf-symbol name='spa_feature_table' size='2072' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/> @@ -4770,8 +4770,8 @@ </function-decl> </abi-instr> <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'> - <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='16128' id='9d5e9e2e'> - <subrange length='36' type-id='7359adad' id='ae666bde'/> + <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='16576' id='9d5e9e2e'> + <subrange length='37' type-id='7359adad' id='ae666bde'/> </array-type-def> <enum-decl name='spa_feature' id='33ecb627'> <underlying-type type-id='9cac1fee'/> @@ -4812,7 +4812,8 @@ <enumerator name='SPA_FEATURE_DRAID' value='33'/> <enumerator name='SPA_FEATURE_ZILSAXATTR' value='34'/> <enumerator name='SPA_FEATURE_HEAD_ERRLOG' value='35'/> - <enumerator name='SPA_FEATURES' value='36'/> + <enumerator name='SPA_FEATURE_BLAKE3' value='36'/> + <enumerator name='SPA_FEATURES' value='37'/> </enum-decl> <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/> <enum-decl name='zfeature_flags' id='6db816a4'> diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 60eb30749..eaa920e56 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -67,6 +67,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/abd.c \ module/zfs/aggsum.c \ module/zfs/arc.c \ + module/zfs/blake3_zfs.c \ module/zfs/blkptr.c \ module/zfs/bplist.c \ module/zfs/bpobj.c \ @@ -171,6 +172,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/zcp_synctask.c \ module/zfs/zfeature.c \ module/zfs/zfs_byteswap.c \ + module/zfs/zfs_chksum.c \ module/zfs/zfs_fm.c \ module/zfs/zfs_fuid.c \ module/zfs/zfs_ratelimit.c \ diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 2694938aa..b1e1ce377 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -743,7 +743,7 @@ This property is not inherited. .It Xo .Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns .Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns -.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr +.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr Ns | Ns Sy blake3 .Xc Controls the checksum used to verify data integrity. The default value is @@ -768,8 +768,9 @@ a recommended practice. The .Sy sha512 , .Sy skein , +.Sy edonr , and -.Sy edonr +.Sy blake3 checksum algorithms require enabling the appropriate features on the pool. .Pp Please see @@ -984,7 +985,7 @@ mount options. .It Xo .Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns .Sy sha256 Ns Oo , Ns Sy verify Oc Ns | Ns Sy sha512 Ns Oo , Ns Sy verify Oc Ns | Ns Sy skein Ns Oo , Ns Sy verify Oc Ns | Ns -.Sy edonr , Ns Sy verify +.Sy edonr , Ns Sy verify Ns | Ns Sy blake3 Ns Oo , Ns Sy verify Oc Ns .Xc Configures deduplication for a dataset. The default value is diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b92109c4a..df9e64701 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -326,6 +326,12 @@ while .Sy freeing is non-zero. . +.feature org.openzfs blake3 no extensible_dataset +This feature enables the use of the BLAKE3 hash algorithm for checksum and dedup. +BLAKE3 is a secure hash algorithm focused on high performance. +.Pp +.checksum-spiel blake3 +. .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark @@ -436,6 +442,8 @@ in ZFS, which means that the checksum is pre-seeded with a secret to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. +.Pp +.checksum-spiel edonr . .feature com.delphix embedded_data no This feature improves the performance and compression ratio of diff --git a/module/Kbuild.in b/module/Kbuild.in index 11099999f..ed8dc23a9 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -75,6 +75,10 @@ ICP_OBJS := \ algs/aes/aes_impl.o \ algs/aes/aes_impl_generic.o \ algs/aes/aes_modes.o \ + algs/blake3/blake3.o \ + algs/blake3/blake3_generic.o \ + algs/blake3/blake3_impl.o \ + algs/blake3/blake3_x86-64.o \ algs/edonr/edonr.o \ algs/modes/cbc.o \ algs/modes/ccm.o \ @@ -105,23 +109,44 @@ ICP_OBJS_X86_64 := \ asm-x86_64/aes/aes_aesni.o \ asm-x86_64/aes/aes_amd64.o \ asm-x86_64/aes/aeskey.o \ + asm-x86_64/blake3/blake3_avx2.o \ + asm-x86_64/blake3/blake3_avx512.o \ + asm-x86_64/blake3/blake3_sse2.o \ + asm-x86_64/blake3/blake3_sse41.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o \ asm-x86_64/sha2/sha256_impl.o \ asm-x86_64/sha2/sha512_impl.o + ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o + +ICP_OBJS_ARM64 := \ + asm-aarch64/blake3/b3_aarch64_sse2.o \ + asm-aarch64/blake3/b3_aarch64_sse41.o + + +ICP_OBJS_PPC_PPC64 := \ + asm-ppc64/blake3/b3_ppc64le_sse2.o \ + asm-ppc64/blake3/b3_ppc64le_sse41.o + zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) +zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64)) +zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) +zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) + +$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ + $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) -$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : asflags-y += -I$(icp_include) -$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflags-y += -I$(icp_include) +$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ + $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) # Suppress objtool "can't find jump dest instruction at" warnings. They # are caused by the constants which are defined in the text section of the @@ -129,6 +154,7 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflag # utility tries to interpret them as opcodes and obviously fails doing so. OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y + # Suppress objtool "unsupported stack pointer realignment" warnings. We are # not using a DRAP register while aligning the stack to a 64 byte boundary. # See #6950 for the reasoning. @@ -261,6 +287,7 @@ ZFS_OBJS := \ abd.o \ aggsum.o \ arc.o \ + blake3_zfs.o \ blkptr.o \ bplist.o \ bpobj.o \ @@ -358,6 +385,7 @@ ZFS_OBJS := \ zcp_synctask.o \ zfeature.o \ zfs_byteswap.o \ + zfs_chksum.o \ zfs_fm.o \ zfs_fuid.o \ zfs_ioctl.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 61f02152d..589ca60b2 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -10,6 +10,10 @@ INCDIR=${.CURDIR:H}/include KMOD= openzfs .PATH: ${SRCDIR}/avl \ + ${SRCDIR}/icp/algs/blake3 \ + ${SRCDIR}/icp/asm-aarch64/blake3 \ + ${SRCDIR}/icp/asm-ppc64/blake3 \ + ${SRCDIR}/icp/asm-x86_64/blake3 \ ${SRCDIR}/lua \ ${SRCDIR}/nvpair \ ${SRCDIR}/icp/algs/edonr \ @@ -31,6 +35,7 @@ CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs CFLAGS+= -I${SRCDIR}/zstd/include +CFLAGS+= -I${SRCDIR}/icp/include CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ @@ -38,7 +43,8 @@ CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DCOMPAT_FREEBSD11 .if ${MACHINE_ARCH} == "amd64" -CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3 +CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \ + -DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL .endif .if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" @@ -73,12 +79,32 @@ CFLAGS+= -DBITS_PER_LONG=64 SRCS= vnode_if.h device_if.h bus_if.h -# avl +#avl SRCS+= avl.c # icp SRCS+= edonr.c +#icp/algs/blake3 +SRCS+= blake3.c \ + blake3_generic.c \ + blake3_impl.c \ + blake3_x86-64.c + +#icp/asm-aarch64/blake3 +SRCS+= b3_aarch64_sse2.S \ + b3_aarch64_sse41.S + +#icp/asm-ppc64/blake3 +SRCS+= b3_ppc64le_sse2.S \ + b3_ppc64le_sse41.S + +#icp/asm-x86_64/blake3 +SRCS+= blake3_avx2.S \ + blake3_avx512.S \ + blake3_sse2.S \ + blake3_sse41.S + #lua SRCS+= lapi.c \ lauxlib.c \ @@ -189,6 +215,7 @@ SRCS+= zfeature_common.c \ SRCS+= abd.c \ aggsum.c \ arc.c \ + blake3_zfs.c \ blkptr.c \ bplist.c \ bpobj.c \ @@ -291,6 +318,7 @@ SRCS+= abd.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ + zfs_chksum.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ @@ -337,8 +365,6 @@ SRCS+= zfs_zstd.c \ zstd_decompress.c \ zstd_decompress_block.c - - beforeinstall: .if ${MK_DEBUG_FILES} != "no" mtree -eu \ diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c new file mode 100644 index 000000000..8c9c06eb9 --- /dev/null +++ b/module/icp/algs/blake3/blake3.c @@ -0,0 +1,732 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include <sys/zfs_context.h> +#include <sys/blake3.h> + +#include "blake3_impl.h" + +/* + * We need 1056 byte stack for blake3_compress_subtree_wide() + * - we define this pragma to make gcc happy + */ +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +/* internal used */ +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +/* internal flags */ +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +/* internal start */ +static void chunk_state_init(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint8_t flags) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; + ctx->blocks_compressed = 0; + ctx->flags = flags; +} + +static void chunk_state_reset(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint64_t chunk_counter) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = chunk_counter; + ctx->blocks_compressed = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; +} + +static size_t chunk_state_len(const blake3_chunk_state_t *ctx) +{ + return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) + + ((size_t)ctx->buf_len); +} + +static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx, + const uint8_t *input, size_t input_len) +{ + size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len); + memcpy(dest, input, take); + ctx->buf_len += (uint8_t)take; + return (take); +} + +static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx) +{ + if (ctx->blocks_compressed == 0) { + return (CHUNK_START); + } else { + return (0); + } +} + +static output_t make_output(const uint32_t input_cv[8], + const uint8_t *block, uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return (ret); +} + +/* + * Chaining values within a given chunk (specifically the compress_in_place + * interface) are represented as words. This avoids unnecessary bytes<->words + * conversion overhead in the portable implementation. However, the hash_many + * interface handles both user input and parent node blocks, so it accepts + * bytes. For that reason, chaining values in the CV stack are represented as + * bytes. + */ +static void output_chaining_value(const blake3_impl_ops_t *ops, + const output_t *ctx, uint8_t cv[32]) +{ + uint32_t cv_words[8]; + memcpy(cv_words, ctx->input_cv, 32); + ops->compress_in_place(cv_words, ctx->block, ctx->block_len, + ctx->counter, ctx->flags); + store_cv_words(cv, cv_words); +} + +static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, + uint64_t seek, uint8_t *out, size_t out_len) +{ + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len, + output_block_counter, ctx->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +static void chunk_state_update(const blake3_impl_ops_t *ops, + blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) +{ + if (ctx->buf_len > 0) { + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + ops->compress_in_place(ctx->cv, ctx->buf, + BLAKE3_BLOCK_LEN, ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + ctx->buf_len = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN, + ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; +} + +static output_t chunk_state_output(const blake3_chunk_state_t *ctx) +{ + uint8_t block_flags = + ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END; + return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter, + block_flags)); +} + +static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) +{ + return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT)); +} + +/* + * Given some input larger than one chunk, return the number of bytes that + * should go in the left subtree. This is the largest power-of-2 number of + * chunks that leaves at least 1 byte for the right subtree. + */ +static size_t left_len(size_t content_len) +{ + /* + * Subtract 1 to reserve at least one byte for the right side. + * content_len + * should always be greater than BLAKE3_CHUNK_LEN. + */ + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return (round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN); +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time + * on a single thread. Write out the chunk chaining values and return the + * number of chunks hashed. These chunks are never the root and never empty; + * those cases use a different codepath. + */ +static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / + BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE, flags, CHUNK_START, + CHUNK_END, out); + + /* + * Hash the remaining partial chunk, if there is one. Note that the + * empty chunk (meaning the empty message) is a different codepath. + */ + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(ops, &chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(ops, &output, &out[chunks_array_len * + BLAKE3_OUT_LEN]); + return (chunks_array_len + 1); + } else { + return (chunks_array_len); + } +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time + * on a single thread. Write out the parent chaining values and return the + * number of parents hashed. (If there's an odd input chaining value left over, + * return it as an additional output.) These parents are never the root and + * never empty; those cases use a different codepath. + */ +static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, + const uint8_t *child_chaining_values, size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, uint8_t *out) +{ + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = &child_chaining_values[2 * + parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE, + flags | PARENT, 0, 0, out); + + /* If there's an odd child left over, it becomes an output. */ + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * + BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); + return (parents_array_len + 1); + } else { + return (parents_array_len); + } +} + +/* + * The wide helper function returns (writes out) an array of chaining values + * and returns the length of that array. The number of chaining values returned + * is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, + * if the input is shorter than that many chunks. The reason for maintaining a + * wide array of chaining values going back up the tree, is to allow the + * implementation to hash as many parents in parallel as possible. + * + * As a special case when the SIMD degree is 1, this function will still return + * at least 2 outputs. This guarantees that this function doesn't perform the + * root compression. (If it did, it would use the wrong flags, and also we + * wouldn't be able to implement exendable ouput.) Note that this function is + * not used when the whole input is only 1 chunk long; that's a different + * codepath. + * + * Why not just have the caller split the input on the first update(), instead + * of implementing this special rule? Because we don't want to limit SIMD or + * multi-threading parallelism for that update(). + */ +static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + /* + * Note that the single chunk case does *not* bump the SIMD degree up + * to 2 when it is 1. If this implementation adds multi-threading in + * the future, this gives us the option of multi-threading even the + * 2-chunk case, which can help performance on smaller platforms. + */ + if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) { + return (compress_chunks_parallel(ops, input, input_len, key, + chunk_counter, flags, out)); + } + + + /* + * With more than simd_degree chunks, we need to recurse. Start by + * dividing the input into left and right subtrees. (Note that this is + * only optimal as long as the SIMD degree is a power of 2. If we ever + * get a SIMD degree of 3 or something, we'll need a more complicated + * strategy.) + */ + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = chunk_counter + + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + /* + * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 + * to account for the special case of returning 2 outputs when the + * SIMD degree is 1. + */ + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = ops->degree; + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + + /* + * The special case: We always use a degree of at least two, + * to make sure there are two outputs. Except, as noted above, + * at the chunk level, where we allow degree=1. (Note that the + * 1-chunk-input case is a different codepath.) + */ + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + /* + * Recurse! If this implementation adds multi-threading support in the + * future, this is where it will go. + */ + size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len, + key, chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide(ops, right_input, + right_input_len, key, right_chunk_counter, flags, right_cvs); + + /* + * The special case again. If simd_degree=1, then we'll have left_n=1 + * and right_n=1. Rather than compressing them into a single output, + * return them directly, to make sure we always have at least two + * outputs. + */ + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return (2); + } + + /* Otherwise, do one layer of parent node compression. */ + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(ops, cv_array, + num_chaining_values, key, flags, out); +} + +/* + * Hash a subtree with compress_subtree_wide(), and then condense the resulting + * list of chaining values down to a single parent node. Don't compress that + * last parent node, however. Instead, return its message bytes (the + * concatenated chaining values of its children). This is necessary when the + * first call to update() supplies a complete subtree, because the topmost + * parent node of that subtree could end up being the root. It's also necessary + * for extended output in the general case. + * + * As with compress_subtree_wide(), this function is not used on inputs of 1 + * chunk or less. That's a different codepath. + */ +static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) +{ + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(ops, input, input_len, + key, chunk_counter, flags, cv_array); + + /* + * If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + * compress_subtree_wide() returns more than 2 chaining values. Condense + * them into 2 by forming parent nodes repeatedly. + */ + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = compress_parents_parallel(ops, cv_array, num_cvs, key, + flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], + uint8_t flags) +{ + memcpy(ctx->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&ctx->chunk, key, flags); + ctx->cv_stack_len = 0; + ctx->ops = blake3_impl_get_ops(); +} + +/* + * As described in hasher_push_cv() below, we do "lazy merging", delaying + * merges until right before the next CV is about to be added. This is + * different from the reference implementation. Another difference is that we + * aren't always merging 1 chunk at a time. Instead, each CV might represent + * any power-of-two number of chunks, as long as the smaller-above-larger + * stack order is maintained. Instead of the "count the trailing 0-bits" + * algorithm described in the spec, we use a "count the total number of + * 1-bits" variant that doesn't require us to retain the subtree size of the + * CV on top of the stack. The principle is the same: each CV that should + * remain in the stack is represented by a 1-bit in the total number of chunks + * (or bytes) so far. + */ +static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len) +{ + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (ctx->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = + parent_output(parent_node, ctx->key, ctx->chunk.flags); + output_chaining_value(ctx->ops, &output, parent_node); + ctx->cv_stack_len -= 1; + } +} + +/* + * In reference_impl.rs, we merge the new CV with existing CVs from the stack + * before pushing it. We can do that because we know more input is coming, so + * we know none of the merges are root. + * + * This setting is different. We want to feed as much input as possible to + * compress_subtree_wide(), without setting aside anything for the chunk_state. + * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once + * as a single subtree, if at all possible. + * + * This leads to two problems: + * 1) This 64 KiB input might be the only call that ever gets made to update. + * In this case, the root node of the 64 KiB subtree would be the root node + * of the whole tree, and it would need to be ROOT finalized. We can't + * compress it until we know. + * 2) This 64 KiB input might complete a larger tree, whose root node is + * similarly going to be the the root of the whole tree. For example, maybe + * we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the + * node at the root of the 256 KiB subtree until we know how to finalize it. + * + * The second problem is solved with "lazy merging". That is, when we're about + * to add a CV to the stack, we don't merge it with anything first, as the + * reference impl does. Instead we do merges using the *previous* CV that was + * added, which is sitting on top of the stack, and we put the new CV + * (unmerged) on top of the stack afterwards. This guarantees that we never + * merge the root node until finalize(). + * + * Solving the first problem requires an additional tool, + * compress_subtree_to_parent_node(). That function always returns the top + * *two* chaining values of the subtree it's compressing. We then do lazy + * merging with each of them separately, so that the second CV will always + * remain unmerged. (That also helps us support extendable output when we're + * hashing an input all-at-once.) + */ +static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) +{ + hasher_merge_cv_stack(ctx, chunk_counter); + memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + ctx->cv_stack_len += 1; +} + +void +Blake3_Init(BLAKE3_CTX *ctx) +{ + hasher_init_base(ctx, BLAKE3_IV, 0); +} + +void +Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]) +{ + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(ctx, key_words, KEYED_HASH); +} + +static void +Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector<uint8_t> v; + * blake3_hasher_update(&hasher, v.data(), v.size()); + */ + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + /* + * If we have some partial chunk bytes in the internal chunk_state, we + * need to finish that chunk first. + */ + if (chunk_state_len(&ctx->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + /* + * If we've filled the current chunk and there's more coming, + * finalize this chunk and proceed. In this case we know it's + * not the root. + */ + if (input_len > 0) { + output_t output = chunk_state_output(&ctx->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(ctx->ops, &output, chunk_cv); + hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter); + chunk_state_reset(&ctx->chunk, ctx->key, + ctx->chunk.chunk_counter + 1); + } else { + return; + } + } + + /* + * Now the chunk_state is clear, and we have more input. If there's + * more than a single chunk (so, definitely not the root chunk), hash + * the largest whole subtree we can, with the full benefits of SIMD + * (and maybe in the future, multi-threading) parallelism. Two + * restrictions: + * - The subtree has to be a power-of-2 number of chunks. Only + * subtrees along the right edge can be incomplete, and we don't know + * where the right edge is going to be until we get to finalize(). + * - The subtree must evenly divide the total number of chunks up + * until this point (if total is not 0). If the current incomplete + * subtree is only waiting for 1 more chunk, we can't hash a subtree + * of 4 chunks. We have to complete the current subtree first. + * Because we might need to break up the input to form powers of 2, or + * to evenly divide what we already have, this part runs in a loop. + */ + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = + ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + /* + * Shrink the subtree_len until it evenly divides the count so + * far. We know that subtree_len itself is a power of 2, so we + * can use a bitmasking trick instead of an actual remainder + * operation. (Note that if the caller consistently passes + * power-of-2 inputs of the same size, as is hopefully + * typical, this loop condition will always fail, and + * subtree_len will always be the full length of the input.) + * + * An aside: We don't have to shrink subtree_len quite this + * much. For example, if count_so_far is 1, we could pass 2 + * chunks to compress_subtree_to_parent_node. Since we'll get + * 2 CVs back, we'll still get the right answer in the end, + * and we might get to use 2-way SIMD parallelism. The problem + * with this optimization, is that it gets us stuck always + * hashing 2 chunks. The total number of chunks will remain + * odd, and we'll never graduate to higher degrees of + * parallelism. See + * https://github.com/BLAKE3-team/BLAKE3/issues/69. + */ + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + /* + * The shrunken subtree_len might now be 1 chunk long. If so, + * hash that one chunk by itself. Otherwise, compress the + * subtree into a pair of CVs. + */ + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, ctx->key, + ctx->chunk.flags); + chunk_state.chunk_counter = ctx->chunk.chunk_counter; + chunk_state_update(ctx->ops, &chunk_state, input_bytes, + subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(ctx->ops, &output, cv); + hasher_push_cv(ctx, cv, chunk_state.chunk_counter); + } else { + /* + * This is the high-performance happy path, though + * getting here depends on the caller giving us a long + * enough input. + */ + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(ctx->ops, input_bytes, + subtree_len, ctx->key, ctx-> chunk.chunk_counter, + ctx->chunk.flags, cv_pair); + hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter); + hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN], + ctx->chunk.chunk_counter + (subtree_chunks / 2)); + } + ctx->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + /* + * If there's any remaining input less than a full chunk, add it to + * the chunk state. In that case, also do a final merge loop to make + * sure the subtree stack doesn't contain any unmerged pairs. The + * remaining input means we know these merges are non-root. This merge + * loop isn't strictly necessary here, because hasher_push_chunk_cv + * already does its own merge loop, but it simplifies + * blake3_hasher_finalize below. + */ + if (input_len > 0) { + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, + input_len); + hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter); + } +} + +void +Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo) +{ + size_t done = 0; + const uint8_t *data = input; + const size_t block_max = 1024 * 64; + + /* max feed buffer to leave the stack size small */ + while (todo != 0) { + size_t block = (todo >= block_max) ? block_max : todo; + Blake3_Update2(ctx, data + done, block); + done += block; + todo -= block; + } +} + +void +Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out) +{ + Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN); +} + +void +Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out, + size_t out_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector<uint8_t> v; + * blake3_hasher_finalize(&hasher, v.data(), v.size()); + */ + if (out_len == 0) { + return; + } + /* If the subtree stack is empty, then the current chunk is the root. */ + if (ctx->cv_stack_len == 0) { + output_t output = chunk_state_output(&ctx->chunk); + output_root_bytes(ctx->ops, &output, seek, out, out_len); + return; + } + /* + * If there are any bytes in the chunk state, finalize that chunk and + * do a roll-up merge between that chunk hash and every subtree in the + * stack. In this case, the extra merge loop at the end of + * blake3_hasher_update guarantees that none of the subtrees in the + * stack need to be merged with each other first. Otherwise, if there + * are no bytes in the chunk state, then the top of the stack is a + * chunk hash, and we start the merge from that. + */ + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&ctx->chunk) > 0) { + cvs_remaining = ctx->cv_stack_len; + output = chunk_state_output(&ctx->chunk); + } else { + /* There are always at least 2 CVs in the stack in this case. */ + cvs_remaining = ctx->cv_stack_len - 2; + output = parent_output(&ctx->cv_stack[cvs_remaining * 32], + ctx->key, ctx->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(ctx->ops, &output, &parent_block[32]); + output = parent_output(parent_block, ctx->key, + ctx->chunk.flags); + } + output_root_bytes(ctx->ops, &output, seek, out, out_len); +} diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c new file mode 100644 index 000000000..6ff9a845c --- /dev/null +++ b/module/icp/algs/blake3/blake3_generic.c @@ -0,0 +1,202 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include <sys/zfs_context.h> +#include "blake3_impl.h" + +#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +static inline void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) +{ + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +static inline void round_fn(uint32_t state[16], const uint32_t *msg, + size_t round) +{ + /* Select the message schedule based on the round. */ + const uint8_t *schedule = BLAKE3_MSG_SCHEDULE[round]; + + /* Mix the columns. */ + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + /* Mix the rows. */ + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +static inline void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = BLAKE3_IV[0]; + state[9] = BLAKE3_IV[1]; + state[10] = BLAKE3_IV[2]; + state[11] = BLAKE3_IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +static inline void blake3_compress_in_place_generic(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +static inline void hash_one_generic(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) +{ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_generic(cv, input, BLAKE3_BLOCK_LEN, + counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +static inline void blake3_compress_xof_generic(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +static inline void blake3_hash_many_generic(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, + boolean_t increment_counter, uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) +{ + while (num_inputs > 0) { + hash_one_generic(inputs[0], blocks, key, counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} + +static inline boolean_t blake3_is_generic_supported(void) +{ + return (B_TRUE); +} + +const blake3_impl_ops_t blake3_generic_impl = { + .compress_in_place = blake3_compress_in_place_generic, + .compress_xof = blake3_compress_xof_generic, + .hash_many = blake3_hash_many_generic, + .is_supported = blake3_is_generic_supported, + .degree = 4, + .name = "generic" +}; diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c new file mode 100644 index 000000000..c3268ec13 --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.c @@ -0,0 +1,256 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include <sys/zfs_context.h> +#include <sys/zio_checksum.h> + +#include "blake3_impl.h" + +static const blake3_impl_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +}; + +/* this pointer holds current ops for implementation */ +static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl; + +/* special implementation selections */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX-1) +#define IMPL_USER (UINT32_MAX-2) +#define IMPL_PARAM (UINT32_MAX-3) + +#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) +static uint32_t icp_blake3_impl = IMPL_FASTEST; + +#define BLAKE3_IMPL_NAME_MAX 16 + +/* id of fastest implementation */ +static uint32_t blake3_fastest_id = 0; + +/* currently used id */ +static uint32_t blake3_current_id = 0; + +/* id of module parameter (-1 == unused) */ +static int blake3_param_id = -1; + +/* return number of supported implementations */ +int +blake3_get_impl_count(void) +{ + static int impls = 0; + int i; + + if (impls) + return (impls); + + for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + impls++; + } + + return (impls); +} + +/* return id of selected implementation */ +int +blake3_get_impl_id(void) +{ + return (blake3_current_id); +} + +/* return name of selected implementation */ +const char * +blake3_get_impl_name(void) +{ + return (blake3_selected_impl->name); +} + +/* setup id as fastest implementation */ +void +blake3_set_impl_fastest(uint32_t id) +{ + blake3_fastest_id = id; +} + +/* set implementation by id */ +void +blake3_set_impl_id(uint32_t id) +{ + int i, cid; + + /* select fastest */ + if (id == IMPL_FASTEST) + id = blake3_fastest_id; + + /* select next or first */ + if (id == IMPL_CYCLE) + id = (++blake3_current_id) % blake3_get_impl_count(); + + /* 0..N for the real impl */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (cid == id) { + blake3_current_id = cid; + blake3_selected_impl = blake3_impls[i]; + return; + } + cid++; + } +} + +/* set implementation by name */ +int +blake3_set_impl_name(const char *name) +{ + int i, cid; + + if (strcmp(name, "fastest") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST); + blake3_set_impl_id(IMPL_FASTEST); + return (0); + } else if (strcmp(name, "cycle") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE); + blake3_set_impl_id(IMPL_CYCLE); + return (0); + } + + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (strcmp(name, blake3_impls[i]->name) == 0) { + if (icp_blake3_impl == IMPL_PARAM) { + blake3_param_id = cid; + return (0); + } + blake3_selected_impl = blake3_impls[i]; + blake3_current_id = cid; + return (0); + } + cid++; + } + + return (-EINVAL); +} + +/* setup implementation */ +void +blake3_setup_impl(void) +{ + switch (IMPL_READ(icp_blake3_impl)) { + case IMPL_PARAM: + blake3_set_impl_id(blake3_param_id); + atomic_swap_32(&icp_blake3_impl, IMPL_USER); + break; + case IMPL_FASTEST: + blake3_set_impl_id(IMPL_FASTEST); + break; + case IMPL_CYCLE: + blake3_set_impl_id(IMPL_CYCLE); + break; + default: + blake3_set_impl_id(blake3_current_id); + break; + } +} + +/* return selected implementation */ +const blake3_impl_ops_t * +blake3_impl_get_ops(void) +{ + /* each call to ops will cycle */ + if (icp_blake3_impl == IMPL_CYCLE) + blake3_set_impl_id(IMPL_CYCLE); + + return (blake3_selected_impl); +} + +#if defined(_KERNEL) && defined(__linux__) +static int +icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp) +{ + char req_name[BLAKE3_IMPL_NAME_MAX]; + size_t i; + + /* sanitize input */ + i = strnlen(name, BLAKE3_IMPL_NAME_MAX); + if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX) + return (-EINVAL); + + strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX); + while (i > 0 && isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + atomic_swap_32(&icp_blake3_impl, IMPL_PARAM); + return (blake3_set_impl_name(req_name)); +} + +static int +icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp) +{ + int i, cid, cnt = 0; + char *fmt; + + /* cycling */ + fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle "; + cnt += sprintf(buffer + cnt, fmt); + + /* fastest one */ + fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest "; + cnt += sprintf(buffer + cnt, fmt); + + /* user selected */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + fmt = (icp_blake3_impl == IMPL_USER && + cid == blake3_current_id) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name); + cid++; + } + + buffer[cnt] = 0; + + return (cnt); +} + +module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get, + NULL, 0644); +MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation."); +#endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h new file mode 100644 index 000000000..7b40cc4d3 --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/blake3.h> +#include <sys/simd.h> + +/* + * Methods used to define BLAKE3 assembler implementations + */ +typedef void (*blake3_compress_in_place_f)(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +typedef void (*blake3_compress_xof_f)(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +typedef boolean_t (*blake3_is_supported_f)(void); + +typedef struct blake3_impl_ops { + blake3_compress_in_place_f compress_in_place; + blake3_compress_xof_f compress_xof; + blake3_hash_many_f hash_many; + blake3_is_supported_f is_supported; + int degree; + const char *name; +} blake3_impl_ops_t; + +/* Return selected BLAKE3 implementation ops */ +extern const blake3_impl_ops_t *blake3_impl_get_ops(void); + +extern const blake3_impl_ops_t blake3_generic_impl; + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse2_impl; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse41_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern const blake3_impl_ops_t blake3_avx2_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern const blake3_impl_ops_t blake3_avx512_impl; +#endif + +#if defined(__x86_64) +#define MAX_SIMD_DEGREE 16 +#else +#define MAX_SIMD_DEGREE 4 +#endif + +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) + +static const uint32_t BLAKE3_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +static inline unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (63 ^ __builtin_clzll(x)); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return (index); +#elif defined(_MSC_VER) && defined(IS_X86_32) + if (x >> 32) { + unsigned long index; + _BitScanReverse(&index, x >> 32); + return (32 + index); + } else { + unsigned long index; + _BitScanReverse(&index, x); + return (index); + } +#else + unsigned int c = 0; + if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if (x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if (x & 0x0000000000000002ULL) { c += 1; } + return (c); +#endif +} + +/* Count the number of 1 bits. */ +static inline unsigned int popcnt(uint64_t x) { + unsigned int count = 0; + + while (x != 0) { + count += 1; + x &= x - 1; + } + + return (count); +} + +/* + * Largest power of two less than or equal to x. + * As a special case, returns 1 when x is 0. + */ +static inline uint64_t round_down_to_power_of_2(uint64_t x) { + return (1ULL << highest_one(x | 1)); +} + +static inline uint32_t counter_low(uint64_t counter) { + return ((uint32_t)counter); +} + +static inline uint32_t counter_high(uint64_t counter) { + return ((uint32_t)(counter >> 32)); +} + +static inline uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +static inline void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_IMPL_H */ diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c new file mode 100644 index 000000000..48715e212 --- /dev/null +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include "blake3_impl.h" + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse2_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse41_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx2_supported(void) +{ + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); +} + +const blake3_impl_ops_t blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx512_supported(void) +{ + return (kfpu_allowed() && zfs_avx512f_available() && + zfs_avx512vl_available()); +} + +const blake3_impl_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S new file mode 100644 index 000000000..59a4d9afd --- /dev/null +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -0,0 +1,2450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE2 -> ARMv8-A + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if defined(__aarch64__) + .text + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI0_1: + .xword 0 + .xword -4294967296 +.LCPI0_2: + .xword -1 + .xword 4294967295 + .text + .globl zfs_blake3_compress_in_place_sse2 + .p2align 2 + .type zfs_blake3_compress_in_place_sse2,@function +zfs_blake3_compress_in_place_sse2: + .cfi_startproc + ldp q3, q2, [x0] + ldp q5, q6, [x1] + add x10, x1, #32 + lsr x11, x3, #32 + fmov s4, w3 + ld2 { v17.4s, v18.4s }, [x10] + adrp x10, .LCPI0_2 + and w8, w2, #0xff + mov v4.s[1], w11 + ldr q1, [x10, :lo12:.LCPI0_2] + and w9, w4, #0xff + adrp x12, .LCPI0_0 + mov v4.s[2], w8 + uzp1 v19.4s, v5.4s, v6.4s + add v3.4s, v2.4s, v3.4s + ldr q7, [x12, :lo12:.LCPI0_0] + mov v4.s[3], w9 + add v3.4s, v3.4s, v19.4s + uzp2 v5.4s, v5.4s, v6.4s + ext v21.16b, v18.16b, v18.16b, #12 + uzp1 v6.4s, v19.4s, v19.4s + ext v22.16b, v19.16b, v19.16b, #12 + eor v4.16b, v3.16b, v4.16b + ext v20.16b, v17.16b, v17.16b, #12 + ext v6.16b, v6.16b, v19.16b, #8 + ext v19.16b, v19.16b, v22.16b, #12 + zip1 v22.2d, v21.2d, v5.2d + rev32 v24.8h, v4.8h + mov v4.16b, v1.16b + zip2 v23.4s, v5.4s, v21.4s + uzp2 v6.4s, v6.4s, v5.4s + bsl v4.16b, v22.16b, v20.16b + add v3.4s, v3.4s, v5.4s + zip1 v5.4s, v23.4s, v20.4s + zip1 v22.4s, v20.4s, v23.4s + add v23.4s, v24.4s, v7.4s + ext v7.16b, v6.16b, v6.16b, #4 + ext v25.16b, v4.16b, v4.16b, #12 + ext v5.16b, v22.16b, v5.16b, #8 + eor v2.16b, v23.16b, v2.16b + uzp1 v4.4s, v4.4s, v25.4s + uzp1 v22.4s, v7.4s, v7.4s + ext v25.16b, v7.16b, v7.16b, #12 + ext v22.16b, v22.16b, v7.16b, #8 + ext v7.16b, v7.16b, v25.16b, #12 + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + add v3.4s, v3.4s, v2.4s + eor v24.16b, v3.16b, v24.16b + add v3.4s, v3.4s, v17.4s + ushr v17.4s, v24.4s, #8 + shl v18.4s, v24.4s, #24 + orr v17.16b, v18.16b, v17.16b + add v18.4s, v17.4s, v23.4s + eor v2.16b, v18.16b, v2.16b + ushr v23.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v3.16b, v3.16b, v3.16b, #12 + orr v2.16b, v2.16b, v23.16b + ext v17.16b, v17.16b, v17.16b, #8 + add v3.4s, v2.4s, v3.4s + adrp x11, .LCPI0_1 + eor v17.16b, v3.16b, v17.16b + ldr q16, [x11, :lo12:.LCPI0_1] + ext v18.16b, v18.16b, v18.16b, #4 + rev32 v24.8h, v17.8h + movi v0.2d, #0xffffffff00000000 + add v23.4s, v3.4s, v21.4s + mov v21.s[1], v20.s[2] + add v20.4s, v18.4s, v24.4s + bit v19.16b, v21.16b, v0.16b + eor v3.16b, v20.16b, v2.16b + uzp2 v2.4s, v22.4s, v19.4s + zip1 v17.2d, v5.2d, v19.2d + zip2 v18.4s, v19.4s, v5.4s + ushr v21.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + ext v22.16b, v2.16b, v2.16b, #4 + bsl v16.16b, v4.16b, v17.16b + zip1 v17.4s, v18.4s, v4.4s + zip1 v18.4s, v4.4s, v18.4s + orr v21.16b, v3.16b, v21.16b + ext v25.16b, v16.16b, v16.16b, #12 + ext v3.16b, v18.16b, v17.16b, #8 + uzp1 v18.4s, v22.4s, v22.4s + ext v26.16b, v22.16b, v22.16b, #12 + add v23.4s, v23.4s, v21.4s + uzp1 v17.4s, v16.4s, v25.4s + ext v16.16b, v18.16b, v22.16b, #8 + ext v18.16b, v22.16b, v26.16b, #12 + eor v22.16b, v23.16b, v24.16b + add v6.4s, v23.4s, v6.4s + ushr v23.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v23.16b + add v20.4s, v22.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ushr v23.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v21.16b, v21.16b, v23.16b + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v21.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v20.16b, v20.16b, v20.16b, #12 + add v6.4s, v6.4s, v19.4s + rev32 v19.8h, v22.8h + add v20.4s, v20.4s, v19.4s + eor v21.16b, v20.16b, v21.16b + ushr v22.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + orr v21.16b, v21.16b, v22.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ushr v22.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v22.16b + add v20.4s, v19.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #12 + ushr v22.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v21.16b, v21.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ext v20.16b, v20.16b, v20.16b, #4 + rev32 v19.8h, v19.8h + add v20.4s, v20.4s, v19.4s + add v6.4s, v6.4s, v5.4s + mov v5.s[1], v4.s[2] + eor v4.16b, v20.16b, v21.16b + ushr v21.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v21.16b, v4.16b, v21.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + add v2.4s, v6.4s, v2.4s + ushr v6.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v6.16b, v19.16b, v6.16b + add v19.4s, v6.4s, v20.4s + eor v20.16b, v19.16b, v21.16b + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v20.4s, v2.4s + eor v6.16b, v2.16b, v6.16b + ext v19.16b, v19.16b, v19.16b, #12 + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v6.4s + mov v22.16b, v0.16b + eor v20.16b, v19.16b, v20.16b + bsl v22.16b, v5.16b, v7.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + add v2.4s, v2.4s, v22.4s + orr v20.16b, v20.16b, v21.16b + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + ushr v21.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v6.16b, v6.16b, v21.16b + add v19.4s, v6.4s, v19.4s + eor v20.16b, v19.16b, v20.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v2.4s, v2.4s, v17.4s + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + uzp2 v5.4s, v16.4s, v22.4s + zip1 v7.2d, v3.2d, v22.2d + zip2 v16.4s, v22.4s, v3.4s + ext v19.16b, v19.16b, v19.16b, #4 + rev32 v22.8h, v6.8h + ext v23.16b, v5.16b, v5.16b, #4 + bif v7.16b, v17.16b, v1.16b + zip1 v24.4s, v16.4s, v17.4s + zip1 v16.4s, v17.4s, v16.4s + add v21.4s, v2.4s, v3.4s + mov v3.s[1], v17.s[2] + add v17.4s, v19.4s, v22.4s + mov v19.16b, v0.16b + ext v25.16b, v7.16b, v7.16b, #12 + ext v4.16b, v16.16b, v24.16b, #8 + uzp1 v16.4s, v23.4s, v23.4s + bsl v19.16b, v3.16b, v18.16b + eor v2.16b, v17.16b, v20.16b + uzp1 v7.4s, v7.4s, v25.4s + ext v25.16b, v16.16b, v23.16b, #8 + zip1 v3.2d, v4.2d, v19.2d + ushr v20.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ext v24.16b, v23.16b, v23.16b, #12 + uzp2 v6.4s, v25.4s, v19.4s + zip2 v18.4s, v19.4s, v4.4s + bif v3.16b, v7.16b, v1.16b + orr v20.16b, v2.16b, v20.16b + ext v16.16b, v23.16b, v24.16b, #12 + ext v23.16b, v6.16b, v6.16b, #4 + zip1 v24.4s, v18.4s, v7.4s + zip1 v18.4s, v7.4s, v18.4s + ext v25.16b, v3.16b, v3.16b, #12 + add v21.4s, v21.4s, v20.4s + ext v2.16b, v18.16b, v24.16b, #8 + uzp1 v18.4s, v23.4s, v23.4s + ext v24.16b, v23.16b, v23.16b, #12 + uzp1 v3.4s, v3.4s, v25.4s + eor v22.16b, v21.16b, v22.16b + ext v25.16b, v18.16b, v23.16b, #8 + dup v18.4s, v2.s[3] + ext v23.16b, v23.16b, v24.16b, #12 + add v5.4s, v21.4s, v5.4s + trn1 v21.4s, v3.4s, v3.4s + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + ext v18.16b, v21.16b, v18.16b, #8 + orr v21.16b, v22.16b, v24.16b + add v17.4s, v21.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ushr v22.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v20.16b, v22.16b + ext v21.16b, v21.16b, v21.16b, #8 + add v5.4s, v20.4s, v5.4s + eor v21.16b, v5.16b, v21.16b + ext v17.16b, v17.16b, v17.16b, #12 + add v5.4s, v5.4s, v19.4s + rev32 v19.8h, v21.8h + add v17.4s, v17.4s, v19.4s + eor v20.16b, v17.16b, v20.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v21.16b + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ushr v21.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v21.16b + add v17.4s, v19.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v5.4s, v5.4s, v7.4s + orr v20.16b, v20.16b, v21.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ext v17.16b, v17.16b, v17.16b, #4 + rev32 v22.8h, v19.8h + add v21.4s, v5.4s, v4.4s + mov v4.s[1], v7.s[2] + add v19.4s, v17.4s, v22.4s + bit v16.16b, v4.16b, v0.16b + eor v5.16b, v19.16b, v20.16b + uzp2 v4.4s, v25.4s, v16.4s + zip1 v7.2d, v2.2d, v16.2d + zip2 v17.4s, v16.4s, v2.4s + ushr v20.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v24.16b, v4.16b, v4.16b, #4 + bif v7.16b, v3.16b, v1.16b + zip1 v25.4s, v17.4s, v3.4s + zip1 v17.4s, v3.4s, v17.4s + orr v20.16b, v5.16b, v20.16b + ext v26.16b, v7.16b, v7.16b, #12 + ext v5.16b, v17.16b, v25.16b, #8 + uzp1 v17.4s, v24.4s, v24.4s + ext v25.16b, v24.16b, v24.16b, #12 + bit v23.16b, v18.16b, v0.16b + add v21.4s, v21.4s, v20.4s + uzp1 v7.4s, v7.4s, v26.4s + ext v26.16b, v17.16b, v24.16b, #8 + ext v17.16b, v24.16b, v25.16b, #12 + eor v22.16b, v21.16b, v22.16b + add v6.4s, v21.4s, v6.4s + zip1 v21.2d, v5.2d, v23.2d + zip2 v24.4s, v23.4s, v5.4s + bif v21.16b, v7.16b, v1.16b + zip1 v1.4s, v24.4s, v7.4s + zip1 v24.4s, v7.4s, v24.4s + ext v1.16b, v24.16b, v1.16b, #8 + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v24.16b + add v19.4s, v22.4s, v19.4s + ext v24.16b, v21.16b, v21.16b, #12 + eor v20.16b, v19.16b, v20.16b + uzp1 v21.4s, v21.4s, v24.4s + ushr v24.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v20.16b, v20.16b, v24.16b + ext v6.16b, v6.16b, v6.16b, #4 + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v20.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #12 + add v6.4s, v6.4s, v16.4s + rev32 v16.8h, v22.8h + add v19.4s, v19.4s, v16.4s + eor v20.16b, v19.16b, v20.16b + ushr v22.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v22.16b + add v6.4s, v6.4s, v20.4s + eor v16.16b, v6.16b, v16.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v6.4s, v3.4s + ushr v6.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v6.16b, v16.16b, v6.16b + add v16.4s, v6.4s, v19.4s + eor v19.16b, v16.16b, v20.16b + ushr v20.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v20.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v3.4s, v3.4s, v19.4s + eor v6.16b, v3.16b, v6.16b + ext v16.16b, v16.16b, v16.16b, #4 + add v2.4s, v3.4s, v2.4s + rev32 v3.8h, v6.8h + add v6.4s, v16.4s, v3.4s + eor v16.16b, v6.16b, v19.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + orr v16.16b, v16.16b, v19.16b + add v2.4s, v2.4s, v16.4s + eor v3.16b, v2.16b, v3.16b + add v2.4s, v2.4s, v4.4s + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v4.16b + add v4.4s, v3.4s, v6.4s + eor v6.16b, v4.16b, v16.16b + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v6.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v6.16b, v4.16b, v6.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + add v2.4s, v2.4s, v23.4s + orr v6.16b, v6.16b, v16.16b + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + add v4.4s, v3.4s, v4.4s + eor v6.16b, v4.16b, v6.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v2.4s, v2.4s, v7.4s + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #4 + rev32 v3.8h, v3.8h + add v2.4s, v2.4s, v5.4s + mov v5.s[1], v7.s[2] + add v4.4s, v4.4s, v3.4s + bsl v0.16b, v5.16b, v17.16b + eor v5.16b, v4.16b, v6.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s + eor v3.16b, v2.16b, v3.16b + ushr v6.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v6.16b + add v4.4s, v3.4s, v4.4s + uzp2 v18.4s, v26.4s, v18.4s + eor v5.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v18.4s + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v5.16b, v5.16b, v6.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v5.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v0.4s, v2.4s, v0.4s + rev32 v2.8h, v3.8h + add v3.4s, v4.4s, v2.4s + eor v4.16b, v3.16b, v5.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v4.16b, v4.16b, v5.16b + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ushr v5.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + orr v2.16b, v2.16b, v5.16b + add v3.4s, v2.4s, v3.4s + eor v4.16b, v3.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #12 + ushr v5.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v0.4s, v0.4s, v21.4s + orr v4.16b, v4.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ext v3.16b, v3.16b, v3.16b, #4 + add v0.4s, v0.4s, v1.4s + rev32 v1.8h, v2.8h + add v2.4s, v3.4s, v1.4s + eor v3.16b, v2.16b, v4.16b + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v3.4s + eor v1.16b, v0.16b, v1.16b + ushr v4.4s, v1.4s, #8 + shl v1.4s, v1.4s, #24 + orr v1.16b, v1.16b, v4.16b + add v2.4s, v1.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v0.16b, v0.16b, v0.16b, #4 + ext v2.16b, v2.16b, v2.16b, #12 + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v1.16b, v1.16b, v1.16b, #8 + eor v0.16b, v2.16b, v0.16b + orr v2.16b, v3.16b, v4.16b + eor v1.16b, v2.16b, v1.16b + stp q0, q1, [x0] + ret +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI1_1: + .xword 0 + .xword -4294967296 +.LCPI1_2: + .xword -1 + .xword 4294967295 + .text + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: + .cfi_startproc + ldp q3, q2, [x0] + ldp q5, q6, [x1] + add x10, x1, #32 + lsr x11, x3, #32 + fmov s4, w3 + ld2 { v17.4s, v18.4s }, [x10] + adrp x10, .LCPI1_2 + and w8, w2, #0xff + mov v4.s[1], w11 + ldr q1, [x10, :lo12:.LCPI1_2] + and w9, w4, #0xff + adrp x12, .LCPI1_0 + mov v4.s[2], w8 + uzp1 v19.4s, v5.4s, v6.4s + add v3.4s, v2.4s, v3.4s + ldr q7, [x12, :lo12:.LCPI1_0] + mov v4.s[3], w9 + add v3.4s, v3.4s, v19.4s + uzp2 v5.4s, v5.4s, v6.4s + ext v21.16b, v18.16b, v18.16b, #12 + uzp1 v6.4s, v19.4s, v19.4s + ext v22.16b, v19.16b, v19.16b, #12 + eor v4.16b, v3.16b, v4.16b + ext v20.16b, v17.16b, v17.16b, #12 + ext v6.16b, v6.16b, v19.16b, #8 + ext v19.16b, v19.16b, v22.16b, #12 + zip1 v22.2d, v21.2d, v5.2d + rev32 v24.8h, v4.8h + mov v4.16b, v1.16b + zip2 v23.4s, v5.4s, v21.4s + uzp2 v6.4s, v6.4s, v5.4s + bsl v4.16b, v22.16b, v20.16b + add v3.4s, v3.4s, v5.4s + zip1 v5.4s, v23.4s, v20.4s + zip1 v22.4s, v20.4s, v23.4s + add v23.4s, v24.4s, v7.4s + ext v7.16b, v6.16b, v6.16b, #4 + ext v25.16b, v4.16b, v4.16b, #12 + ext v5.16b, v22.16b, v5.16b, #8 + eor v2.16b, v23.16b, v2.16b + uzp1 v4.4s, v4.4s, v25.4s + uzp1 v22.4s, v7.4s, v7.4s + ext v25.16b, v7.16b, v7.16b, #12 + ext v22.16b, v22.16b, v7.16b, #8 + ext v7.16b, v7.16b, v25.16b, #12 + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + add v3.4s, v3.4s, v2.4s + eor v24.16b, v3.16b, v24.16b + add v3.4s, v3.4s, v17.4s + ushr v17.4s, v24.4s, #8 + shl v18.4s, v24.4s, #24 + orr v17.16b, v18.16b, v17.16b + add v18.4s, v17.4s, v23.4s + eor v2.16b, v18.16b, v2.16b + ushr v23.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v3.16b, v3.16b, v3.16b, #12 + orr v2.16b, v2.16b, v23.16b + ext v17.16b, v17.16b, v17.16b, #8 + add v3.4s, v2.4s, v3.4s + adrp x11, .LCPI1_1 + eor v17.16b, v3.16b, v17.16b + ldr q16, [x11, :lo12:.LCPI1_1] + ext v18.16b, v18.16b, v18.16b, #4 + rev32 v24.8h, v17.8h + movi v0.2d, #0xffffffff00000000 + add v23.4s, v3.4s, v21.4s + mov v21.s[1], v20.s[2] + add v20.4s, v18.4s, v24.4s + bit v19.16b, v21.16b, v0.16b + eor v3.16b, v20.16b, v2.16b + uzp2 v2.4s, v22.4s, v19.4s + zip1 v17.2d, v5.2d, v19.2d + zip2 v18.4s, v19.4s, v5.4s + ushr v21.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + ext v22.16b, v2.16b, v2.16b, #4 + bsl v16.16b, v4.16b, v17.16b + zip1 v17.4s, v18.4s, v4.4s + zip1 v18.4s, v4.4s, v18.4s + orr v21.16b, v3.16b, v21.16b + ext v25.16b, v16.16b, v16.16b, #12 + ext v3.16b, v18.16b, v17.16b, #8 + uzp1 v18.4s, v22.4s, v22.4s + ext v26.16b, v22.16b, v22.16b, #12 + add v23.4s, v23.4s, v21.4s + uzp1 v17.4s, v16.4s, v25.4s + ext v16.16b, v18.16b, v22.16b, #8 + ext v18.16b, v22.16b, v26.16b, #12 + eor v22.16b, v23.16b, v24.16b + add v6.4s, v23.4s, v6.4s + ushr v23.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v23.16b + add v20.4s, v22.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ushr v23.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v21.16b, v21.16b, v23.16b + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v21.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v20.16b, v20.16b, v20.16b, #12 + add v6.4s, v6.4s, v19.4s + rev32 v19.8h, v22.8h + add v20.4s, v20.4s, v19.4s + eor v21.16b, v20.16b, v21.16b + ushr v22.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + orr v21.16b, v21.16b, v22.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ushr v22.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v22.16b + add v20.4s, v19.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #12 + ushr v22.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v21.16b, v21.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ext v20.16b, v20.16b, v20.16b, #4 + rev32 v19.8h, v19.8h + add v20.4s, v20.4s, v19.4s + add v6.4s, v6.4s, v5.4s + mov v5.s[1], v4.s[2] + eor v4.16b, v20.16b, v21.16b + ushr v21.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v21.16b, v4.16b, v21.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + add v2.4s, v6.4s, v2.4s + ushr v6.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v6.16b, v19.16b, v6.16b + add v19.4s, v6.4s, v20.4s + eor v20.16b, v19.16b, v21.16b + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v20.4s, v2.4s + eor v6.16b, v2.16b, v6.16b + ext v19.16b, v19.16b, v19.16b, #12 + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v6.4s + mov v22.16b, v0.16b + eor v20.16b, v19.16b, v20.16b + bsl v22.16b, v5.16b, v7.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + add v2.4s, v2.4s, v22.4s + orr v20.16b, v20.16b, v21.16b + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + ushr v21.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v6.16b, v6.16b, v21.16b + add v19.4s, v6.4s, v19.4s + eor v20.16b, v19.16b, v20.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v2.4s, v2.4s, v17.4s + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + uzp2 v5.4s, v16.4s, v22.4s + zip1 v7.2d, v3.2d, v22.2d + zip2 v16.4s, v22.4s, v3.4s + ext v19.16b, v19.16b, v19.16b, #4 + rev32 v22.8h, v6.8h + ext v23.16b, v5.16b, v5.16b, #4 + bif v7.16b, v17.16b, v1.16b + zip1 v24.4s, v16.4s, v17.4s + zip1 v16.4s, v17.4s, v16.4s + add v21.4s, v2.4s, v3.4s + mov v3.s[1], v17.s[2] + add v17.4s, v19.4s, v22.4s + mov v19.16b, v0.16b + ext v25.16b, v7.16b, v7.16b, #12 + ext v4.16b, v16.16b, v24.16b, #8 + uzp1 v16.4s, v23.4s, v23.4s + bsl v19.16b, v3.16b, v18.16b + eor v2.16b, v17.16b, v20.16b + uzp1 v7.4s, v7.4s, v25.4s + ext v25.16b, v16.16b, v23.16b, #8 + zip1 v3.2d, v4.2d, v19.2d + ushr v20.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ext v24.16b, v23.16b, v23.16b, #12 + uzp2 v6.4s, v25.4s, v19.4s + zip2 v18.4s, v19.4s, v4.4s + bif v3.16b, v7.16b, v1.16b + orr v20.16b, v2.16b, v20.16b + ext v16.16b, v23.16b, v24.16b, #12 + ext v23.16b, v6.16b, v6.16b, #4 + zip1 v24.4s, v18.4s, v7.4s + zip1 v18.4s, v7.4s, v18.4s + ext v25.16b, v3.16b, v3.16b, #12 + add v21.4s, v21.4s, v20.4s + ext v2.16b, v18.16b, v24.16b, #8 + uzp1 v18.4s, v23.4s, v23.4s + ext v24.16b, v23.16b, v23.16b, #12 + uzp1 v3.4s, v3.4s, v25.4s + eor v22.16b, v21.16b, v22.16b + ext v25.16b, v18.16b, v23.16b, #8 + dup v18.4s, v2.s[3] + ext v23.16b, v23.16b, v24.16b, #12 + add v5.4s, v21.4s, v5.4s + trn1 v21.4s, v3.4s, v3.4s + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + ext v18.16b, v21.16b, v18.16b, #8 + orr v21.16b, v22.16b, v24.16b + add v17.4s, v21.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ushr v22.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v20.16b, v22.16b + ext v21.16b, v21.16b, v21.16b, #8 + add v5.4s, v20.4s, v5.4s + eor v21.16b, v5.16b, v21.16b + ext v17.16b, v17.16b, v17.16b, #12 + add v5.4s, v5.4s, v19.4s + rev32 v19.8h, v21.8h + add v17.4s, v17.4s, v19.4s + eor v20.16b, v17.16b, v20.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v21.16b + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ushr v21.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v21.16b + add v17.4s, v19.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v5.4s, v5.4s, v7.4s + orr v20.16b, v20.16b, v21.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ext v17.16b, v17.16b, v17.16b, #4 + rev32 v22.8h, v19.8h + add v21.4s, v5.4s, v4.4s + mov v4.s[1], v7.s[2] + add v19.4s, v17.4s, v22.4s + bit v16.16b, v4.16b, v0.16b + eor v5.16b, v19.16b, v20.16b + uzp2 v4.4s, v25.4s, v16.4s + zip1 v7.2d, v2.2d, v16.2d + zip2 v17.4s, v16.4s, v2.4s + ushr v20.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v24.16b, v4.16b, v4.16b, #4 + bif v7.16b, v3.16b, v1.16b + zip1 v25.4s, v17.4s, v3.4s + zip1 v17.4s, v3.4s, v17.4s + orr v20.16b, v5.16b, v20.16b + ext v26.16b, v7.16b, v7.16b, #12 + ext v5.16b, v17.16b, v25.16b, #8 + uzp1 v17.4s, v24.4s, v24.4s + ext v25.16b, v24.16b, v24.16b, #12 + bit v23.16b, v18.16b, v0.16b + add v21.4s, v21.4s, v20.4s + uzp1 v7.4s, v7.4s, v26.4s + ext v26.16b, v17.16b, v24.16b, #8 + ext v17.16b, v24.16b, v25.16b, #12 + eor v22.16b, v21.16b, v22.16b + add v6.4s, v21.4s, v6.4s + zip1 v21.2d, v5.2d, v23.2d + zip2 v24.4s, v23.4s, v5.4s + bif v21.16b, v7.16b, v1.16b + zip1 v1.4s, v24.4s, v7.4s + zip1 v24.4s, v7.4s, v24.4s + ext v1.16b, v24.16b, v1.16b, #8 + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v24.16b + add v19.4s, v22.4s, v19.4s + ext v24.16b, v21.16b, v21.16b, #12 + eor v20.16b, v19.16b, v20.16b + uzp1 v21.4s, v21.4s, v24.4s + ushr v24.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v20.16b, v20.16b, v24.16b + ext v6.16b, v6.16b, v6.16b, #4 + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v20.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #12 + add v6.4s, v6.4s, v16.4s + rev32 v16.8h, v22.8h + add v19.4s, v19.4s, v16.4s + eor v20.16b, v19.16b, v20.16b + ushr v22.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v22.16b + add v6.4s, v6.4s, v20.4s + eor v16.16b, v6.16b, v16.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v6.4s, v3.4s + ushr v6.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v6.16b, v16.16b, v6.16b + add v16.4s, v6.4s, v19.4s + eor v19.16b, v16.16b, v20.16b + ushr v20.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v20.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v3.4s, v3.4s, v19.4s + eor v6.16b, v3.16b, v6.16b + ext v16.16b, v16.16b, v16.16b, #4 + add v2.4s, v3.4s, v2.4s + rev32 v3.8h, v6.8h + add v6.4s, v16.4s, v3.4s + eor v16.16b, v6.16b, v19.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + orr v16.16b, v16.16b, v19.16b + add v2.4s, v2.4s, v16.4s + eor v3.16b, v2.16b, v3.16b + add v2.4s, v2.4s, v4.4s + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v4.16b + add v4.4s, v3.4s, v6.4s + eor v6.16b, v4.16b, v16.16b + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v6.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v6.16b, v4.16b, v6.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + add v2.4s, v2.4s, v23.4s + orr v6.16b, v6.16b, v16.16b + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + add v4.4s, v3.4s, v4.4s + eor v6.16b, v4.16b, v6.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v2.4s, v2.4s, v7.4s + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #4 + rev32 v3.8h, v3.8h + add v2.4s, v2.4s, v5.4s + mov v5.s[1], v7.s[2] + add v4.4s, v4.4s, v3.4s + bsl v0.16b, v5.16b, v17.16b + eor v5.16b, v4.16b, v6.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s + eor v3.16b, v2.16b, v3.16b + ushr v6.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v6.16b + add v4.4s, v3.4s, v4.4s + uzp2 v18.4s, v26.4s, v18.4s + eor v5.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v18.4s + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v5.16b, v5.16b, v6.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v5.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v0.4s, v2.4s, v0.4s + rev32 v2.8h, v3.8h + add v3.4s, v4.4s, v2.4s + eor v4.16b, v3.16b, v5.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v4.16b, v4.16b, v5.16b + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ushr v5.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + orr v2.16b, v2.16b, v5.16b + add v3.4s, v2.4s, v3.4s + eor v4.16b, v3.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #12 + ushr v5.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v0.4s, v0.4s, v21.4s + orr v4.16b, v4.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ext v3.16b, v3.16b, v3.16b, #4 + add v0.4s, v0.4s, v1.4s + rev32 v1.8h, v2.8h + add v2.4s, v3.4s, v1.4s + eor v3.16b, v2.16b, v4.16b + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v3.4s + eor v1.16b, v0.16b, v1.16b + ushr v4.4s, v1.4s, #8 + shl v1.4s, v1.4s, #24 + orr v1.16b, v1.16b, v4.16b + add v2.4s, v1.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v0.16b, v0.16b, v0.16b, #4 + ext v1.16b, v1.16b, v1.16b, #8 + ext v2.16b, v2.16b, v2.16b, #12 + orr v3.16b, v3.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + eor v3.16b, v3.16b, v1.16b + stp q0, q3, [x5] + ldr q0, [x0] + eor v0.16b, v0.16b, v2.16b + str q0, [x5, #32] + ldr q0, [x0, #16] + eor v0.16b, v0.16b, v1.16b + str q0, [x5, #48] + ret +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_0: + .word 0 + .word 1 + .word 2 + .word 3 + .text + .globl zfs_blake3_hash_many_sse2 + .p2align 2 + .type zfs_blake3_hash_many_sse2,@function +zfs_blake3_hash_many_sse2: + .cfi_startproc + stp d15, d14, [sp, #-160]! + stp d13, d12, [sp, #16] + stp d11, d10, [sp, #32] + stp d9, d8, [sp, #48] + stp x29, x30, [sp, #64] + stp x28, x27, [sp, #80] + stp x26, x25, [sp, #96] + stp x24, x23, [sp, #112] + stp x22, x21, [sp, #128] + stp x20, x19, [sp, #144] + mov x29, sp + sub sp, sp, #384 + .cfi_def_cfa w29, 160 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + ldr x26, [x29, #168] + ldrb w27, [x29, #160] + mov w19, w6 + mov x20, x4 + mov x22, x2 + mov x28, x1 + cmp x1, #4 + mov x24, x0 + str x3, [sp, #40] + b.lo .LBB2_8 + adrp x9, .LCPI2_0 + ldr q0, [x9, :lo12:.LCPI2_0] + sbfx w11, w5, #0, #1 + dup v1.4s, w11 + mov w9, #58983 + mov w10, #44677 + and v0.16b, v1.16b, v0.16b + mov w11, #62322 + mov w12, #62778 + orr w8, w7, w19 + movk w9, #27145, lsl #16 + movk w10, #47975, lsl #16 + movk w11, #15470, lsl #16 + str q0, [sp, #16] + orr v0.4s, #128, lsl #24 + movk w12, #42319, lsl #16 + str q0, [sp] +.LBB2_2: + ldr x0, [sp, #40] + mov x13, x0 + ld1r { v20.4s }, [x13], #4 + add x14, x0, #8 + add x15, x0, #12 + add x16, x0, #16 + add x17, x0, #20 + add x18, x0, #24 + add x0, x0, #28 + ld1r { v17.4s }, [x14] + ld1r { v6.4s }, [x15] + ld1r { v8.4s }, [x16] + ld1r { v9.4s }, [x17] + ld1r { v31.4s }, [x18] + ld1r { v26.4s }, [x13] + ld1r { v15.4s }, [x0] + cbz x22, .LBB2_7 + ldr q1, [sp, #16] + dup v0.4s, w20 + ldp x13, x14, [x24] + ldp x15, x16, [x24, #16] + add v1.4s, v0.4s, v1.4s + movi v0.4s, #128, lsl #24 + str q1, [sp, #64] + eor v0.16b, v1.16b, v0.16b + ldr q1, [sp] + lsr x18, x20, #32 + mov x17, xzr + cmgt v0.4s, v1.4s, v0.4s + dup v1.4s, w18 + sub v0.4s, v1.4s, v0.4s + mov w18, w8 + str q0, [sp, #48] +.LBB2_4: + mov w2, #16 + bfi x2, x17, #6, #58 + ldr q1, [x13, x2] + ldr q3, [x14, x2] + ldr q2, [x15, x2] + ldr q4, [x16, x2] + mov w2, #32 + bfi x2, x17, #6, #58 + ldr q5, [x13, x2] + ldr q18, [x14, x2] + ldr q19, [x15, x2] + ldr q23, [x16, x2] + mov w2, #48 + lsl x3, x17, #6 + bfi x2, x17, #6, #58 + add x17, x17, #1 + ldr q0, [x13, x3] + ldr q21, [x14, x3] + ldr q7, [x15, x3] + ldr q16, [x16, x3] + cmp x17, x22 + ldr q13, [x13, x2] + ldr q14, [x14, x2] + ldr q29, [x15, x2] + ldr q10, [x16, x2] + csel w2, w27, wzr, eq + orr w18, w2, w18 + mov x0, xzr + and w18, w18, #0xff + add x3, x3, #256 +.LBB2_5: + ldr x2, [x24, x0] + add x0, x0, #8 + cmp x0, #32 + add x2, x2, x3 + prfm pldl1keep, [x2] + b.ne .LBB2_5 + dup v22.4s, w18 + str q22, [sp, #192] + zip1 v27.4s, v0.4s, v21.4s + zip2 v21.4s, v0.4s, v21.4s + zip1 v0.4s, v7.4s, v16.4s + zip2 v22.4s, v7.4s, v16.4s + zip1 v7.4s, v1.4s, v3.4s + zip1 v25.4s, v2.4s, v4.4s + zip2 v16.4s, v2.4s, v4.4s + zip1 v11.4s, v19.4s, v23.4s + zip2 v12.4s, v19.4s, v23.4s + zip1 v19.4s, v13.4s, v14.4s + zip2 v23.4s, v13.4s, v14.4s + zip1 v13.4s, v29.4s, v10.4s + zip2 v14.4s, v29.4s, v10.4s + add v10.4s, v20.4s, v8.4s + add v2.4s, v26.4s, v9.4s + ext v20.16b, v22.16b, v21.16b, #8 + ext v26.16b, v25.16b, v7.16b, #8 + zip2 v24.4s, v1.4s, v3.4s + add v1.4s, v6.4s, v15.4s + ext v6.16b, v0.16b, v27.16b, #8 + ext v20.16b, v21.16b, v20.16b, #8 + mov v21.d[1], v22.d[0] + ext v22.16b, v7.16b, v26.16b, #8 + mov v7.d[1], v25.d[0] + add v3.4s, v17.4s, v31.4s + str q1, [sp, #144] + ext v1.16b, v27.16b, v6.16b, #8 + mov v6.16b, v7.16b + zip1 v28.4s, v5.4s, v18.4s + stur q1, [x29, #-80] + mov v1.16b, v27.16b + mov v27.16b, v24.16b + add v3.4s, v3.4s, v6.4s + ldr q6, [sp, #64] + ext v29.16b, v16.16b, v24.16b, #8 + mov v1.d[1], v0.d[0] + ext v0.16b, v11.16b, v28.16b, #8 + mov v27.d[1], v16.d[0] + ext v16.16b, v14.16b, v23.16b, #8 + stur q7, [x29, #-144] + ext v7.16b, v24.16b, v29.16b, #8 + ext v29.16b, v28.16b, v0.16b, #8 + ext v0.16b, v23.16b, v16.16b, #8 + mov v23.d[1], v14.d[0] + stp q0, q23, [sp, #80] + add v0.4s, v10.4s, v1.4s + eor v16.16b, v0.16b, v6.16b + ldr q6, [sp, #48] + add v2.4s, v2.4s, v21.4s + mov v28.d[1], v11.d[0] + zip2 v18.4s, v5.4s, v18.4s + eor v10.16b, v2.16b, v6.16b + movi v6.4s, #64 + eor v11.16b, v3.16b, v6.16b + ldr q6, [sp, #144] + dup v17.4s, w9 + ext v30.16b, v12.16b, v18.16b, #8 + rev32 v16.8h, v16.8h + dup v5.4s, w10 + ext v25.16b, v18.16b, v30.16b, #8 + mov v30.16b, v23.16b + mov v23.16b, v1.16b + str q1, [sp, #160] + rev32 v10.8h, v10.8h + add v1.4s, v16.4s, v17.4s + add v17.4s, v6.4s, v27.4s + ldr q6, [sp, #192] + dup v4.4s, w11 + rev32 v11.8h, v11.8h + add v5.4s, v10.4s, v5.4s + eor v8.16b, v1.16b, v8.16b + stur q21, [x29, #-128] + mov v18.d[1], v12.d[0] + add v4.4s, v11.4s, v4.4s + eor v9.16b, v5.16b, v9.16b + ushr v12.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + ldur q21, [x29, #-80] + ext v26.16b, v13.16b, v19.16b, #8 + eor v31.16b, v4.16b, v31.16b + orr v8.16b, v8.16b, v12.16b + ushr v12.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + ext v26.16b, v19.16b, v26.16b, #8 + mov v19.d[1], v13.d[0] + orr v9.16b, v9.16b, v12.16b + ushr v12.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v17.16b, v6.16b + orr v31.16b, v31.16b, v12.16b + dup v12.4s, w12 + rev32 v13.8h, v13.8h + add v12.4s, v13.4s, v12.4s + add v0.4s, v0.4s, v21.4s + eor v14.16b, v12.16b, v15.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v20.4s + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v22.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v28.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v18.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v19.4s + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v30.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + mov v24.16b, v7.16b + stur q7, [x29, #-112] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + mov v7.16b, v26.16b + add v3.4s, v3.4s, v26.4s + ldr q26, [sp, #80] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + add v0.4s, v0.4s, v29.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v25.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v13.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + str q22, [sp, #128] + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + ldur q22, [x29, #-128] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + eor v8.16b, v5.16b, v8.16b + mov v6.16b, v18.16b + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + ldur q18, [x29, #-144] + orr v8.16b, v8.16b, v15.16b + add v0.4s, v0.4s, v22.4s + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v20.4s + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v24.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v18.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v13.16b, v17.16b, v13.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v13.8h, v13.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v13.4s + add v0.4s, v0.4s, v27.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v6.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v23.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v21.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v19.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v29.4s + str q28, [sp, #112] + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldp q28, q23, [sp, #112] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ldr q21, [sp, #96] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + add v0.4s, v0.4s, v25.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v23.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v21.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v28.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v13.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + eor v8.16b, v5.16b, v8.16b + mov v30.16b, v29.16b + mov v29.16b, v25.16b + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + ldur q25, [x29, #-112] + orr v8.16b, v8.16b, v15.16b + add v0.4s, v0.4s, v20.4s + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v6.4s + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v7.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v25.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v13.16b, v17.16b, v13.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v13.8h, v13.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v13.4s + add v0.4s, v0.4s, v18.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v19.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v22.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v21.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v27.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v30.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v29.4s + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v28.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + ldr q24, [sp, #160] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + stur q7, [x29, #-64] + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + mov v7.16b, v26.16b + add v3.4s, v3.4s, v26.4s + ldur q26, [x29, #-80] + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + add v0.4s, v0.4s, v23.4s + orr v8.16b, v8.16b, v15.16b + add v15.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v24.4s + eor v0.16b, v15.16b, v13.16b + add v2.4s, v2.4s, v31.4s + ushr v13.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + orr v0.16b, v0.16b, v13.16b + ushr v13.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v13.16b + ushr v13.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v13.16b + ushr v13.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v13.16b + ushr v13.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + orr v9.16b, v9.16b, v13.16b + ushr v13.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + add v1.4s, v10.4s, v1.4s + orr v31.16b, v31.16b, v13.16b + eor v13.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + ushr v14.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v14.16b + ushr v14.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + stur q6, [x29, #-96] + orr v8.16b, v8.16b, v14.16b + add v14.4s, v15.4s, v6.4s + ldur q6, [x29, #-64] + mov v18.16b, v19.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v18.4s + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v21.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v6.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + str q27, [sp, #176] + mov v27.16b, v30.16b + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + add v14.4s, v14.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v27.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v20.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + mov v30.16b, v23.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v0.16b, v17.16b, v0.16b + add v1.4s, v16.4s, v1.4s + ldur q23, [x29, #-144] + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v0.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v23.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v29.4s + orr v13.16b, v13.16b, v15.16b + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v30.4s + rev32 v0.8h, v0.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + add v4.4s, v4.4s, v0.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldur q22, [x29, #-128] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + ldr q26, [sp, #176] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v5.4s, v11.4s + add v14.4s, v14.4s, v24.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v22.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v28.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v15.16b + add v14.4s, v14.4s, v18.4s + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v27.4s + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v7.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v21.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + add v14.4s, v14.4s, v6.4s + ldur q6, [x29, #-96] + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + stur q20, [x29, #-160] + mov v20.16b, v29.16b + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + mov v19.16b, v29.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v20.4s + mov v19.16b, v28.16b + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v6.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v19.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v0.16b, v17.16b, v0.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v0.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v25.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v30.4s + orr v13.16b, v13.16b, v15.16b + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v24.4s + rev32 v0.8h, v0.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + mov v29.16b, v27.16b + add v4.4s, v4.4s, v0.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldur q27, [x29, #-160] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ldur q6, [x29, #-80] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v5.4s, v11.4s + add v14.4s, v14.4s, v22.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v27.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v6.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v23.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v15.16b + add v14.4s, v14.4s, v29.4s + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v20.4s + mov v28.16b, v7.16b + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v19.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v28.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + add v14.4s, v14.4s, v21.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v30.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + add v3.4s, v3.4s, v18.4s + orr v10.16b, v10.16b, v15.16b + add v15.4s, v3.4s, v31.4s + eor v3.16b, v15.16b, v11.16b + ushr v11.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v11.16b, v3.16b, v11.16b + add v3.4s, v17.4s, v6.4s + add v17.4s, v3.4s, v13.4s + eor v0.16b, v17.16b, v0.16b + ushr v3.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + add v1.4s, v16.4s, v1.4s + orr v0.16b, v0.16b, v3.16b + eor v3.16b, v1.16b, v8.16b + ushr v8.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + add v5.4s, v10.4s, v5.4s + orr v8.16b, v3.16b, v8.16b + eor v3.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + ushr v9.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + eor v31.16b, v4.16b, v31.16b + mov v7.16b, v23.16b + mov v23.16b, v28.16b + mov v28.16b, v6.16b + orr v3.16b, v3.16b, v9.16b + ushr v9.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + ldur q6, [x29, #-64] + orr v31.16b, v31.16b, v9.16b + add v9.4s, v0.4s, v12.4s + eor v12.16b, v9.16b, v13.16b + ushr v13.4s, v12.4s, #7 + shl v12.4s, v12.4s, #25 + orr v12.16b, v12.16b, v13.16b + add v13.4s, v14.4s, v6.4s + add v13.4s, v13.4s, v3.4s + eor v0.16b, v13.16b, v0.16b + add v2.4s, v2.4s, v24.4s + rev32 v14.8h, v0.8h + add v0.4s, v2.4s, v31.4s + add v6.4s, v4.4s, v14.4s + eor v2.16b, v0.16b, v16.16b + eor v3.16b, v6.16b, v3.16b + rev32 v16.8h, v2.8h + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v2.4s, v9.4s, v16.4s + orr v4.16b, v3.16b, v4.16b + eor v3.16b, v2.16b, v31.16b + ushr v31.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v31.16b + add v31.4s, v15.4s, v22.4s + add v31.4s, v31.4s, v12.4s + add v17.4s, v17.4s, v7.4s + eor v9.16b, v31.16b, v10.16b + add v17.4s, v17.4s, v8.4s + rev32 v9.8h, v9.8h + eor v11.16b, v17.16b, v11.16b + add v1.4s, v1.4s, v9.4s + rev32 v11.8h, v11.8h + eor v10.16b, v1.16b, v12.16b + add v5.4s, v5.4s, v11.4s + ushr v12.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v8.16b, v5.16b, v8.16b + orr v10.16b, v10.16b, v12.16b + ushr v12.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + orr v8.16b, v8.16b, v12.16b + add v12.4s, v13.4s, v27.4s + add v12.4s, v12.4s, v4.4s + eor v13.16b, v12.16b, v14.16b + ldur q14, [x29, #-96] + mov v25.16b, v29.16b + add v29.4s, v12.4s, v20.4s + add v20.4s, v31.4s, v26.4s + add v0.4s, v0.4s, v14.4s + add v0.4s, v0.4s, v3.4s + eor v16.16b, v0.16b, v16.16b + add v0.4s, v0.4s, v30.4s + ldur q30, [x29, #-112] + add v20.4s, v20.4s, v10.4s + eor v31.16b, v20.16b, v9.16b + add v20.4s, v20.4s, v28.4s + add v17.4s, v17.4s, v30.4s + add v17.4s, v17.4s, v8.4s + eor v9.16b, v17.16b, v11.16b + ushr v28.4s, v13.4s, #8 + shl v11.4s, v13.4s, #24 + orr v28.16b, v11.16b, v28.16b + ushr v11.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v16.16b, v16.16b, v11.16b + ushr v11.4s, v31.4s, #8 + shl v31.4s, v31.4s, #24 + add v6.4s, v28.4s, v6.4s + orr v31.16b, v31.16b, v11.16b + ushr v11.4s, v9.4s, #8 + shl v9.4s, v9.4s, #24 + add v2.4s, v16.4s, v2.4s + eor v4.16b, v6.16b, v4.16b + orr v9.16b, v9.16b, v11.16b + add v1.4s, v31.4s, v1.4s + eor v3.16b, v2.16b, v3.16b + ushr v11.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v5.4s, v9.4s, v5.4s + eor v10.16b, v1.16b, v10.16b + orr v4.16b, v4.16b, v11.16b + ushr v11.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v3.16b, v3.16b, v11.16b + ushr v11.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + orr v10.16b, v10.16b, v11.16b + ushr v11.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v11.16b + add v29.4s, v29.4s, v8.4s + eor v16.16b, v29.16b, v16.16b + add v0.4s, v0.4s, v4.4s + mov v12.16b, v26.16b + add v17.4s, v17.4s, v19.4s + add v26.4s, v29.4s, v23.4s + eor v29.16b, v0.16b, v31.16b + add v20.4s, v20.4s, v3.4s + rev32 v16.8h, v16.8h + stur q18, [x29, #-176] + mov v18.16b, v27.16b + add v0.4s, v0.4s, v24.4s + eor v27.16b, v20.16b, v9.16b + add v17.4s, v17.4s, v10.4s + rev32 v24.8h, v29.8h + add v1.4s, v1.4s, v16.4s + add v20.4s, v20.4s, v25.4s + eor v25.16b, v17.16b, v28.16b + rev32 v27.8h, v27.8h + add v5.4s, v5.4s, v24.4s + eor v28.16b, v1.16b, v8.16b + rev32 v25.8h, v25.8h + add v6.4s, v6.4s, v27.4s + eor v4.16b, v5.16b, v4.16b + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + add v2.4s, v2.4s, v25.4s + eor v3.16b, v6.16b, v3.16b + orr v28.16b, v28.16b, v31.16b + ushr v31.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v29.16b, v2.16b, v10.16b + orr v4.16b, v4.16b, v31.16b + ushr v31.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v26.4s, v26.4s, v28.4s + orr v3.16b, v3.16b, v31.16b + ushr v31.4s, v29.4s, #12 + shl v29.4s, v29.4s, #20 + eor v16.16b, v26.16b, v16.16b + add v0.4s, v0.4s, v4.4s + add v17.4s, v17.4s, v12.4s + orr v29.16b, v29.16b, v31.16b + eor v24.16b, v0.16b, v24.16b + add v0.4s, v0.4s, v22.4s + add v20.4s, v20.4s, v3.4s + ushr v22.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + add v23.4s, v26.4s, v21.4s + eor v21.16b, v20.16b, v27.16b + add v17.4s, v17.4s, v29.4s + orr v16.16b, v16.16b, v22.16b + ushr v22.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v25.16b, v17.16b, v25.16b + orr v22.16b, v24.16b, v22.16b + ushr v24.4s, v21.4s, #8 + shl v21.4s, v21.4s, #24 + orr v21.16b, v21.16b, v24.16b + ushr v24.4s, v25.4s, #8 + shl v25.4s, v25.4s, #24 + add v1.4s, v16.4s, v1.4s + orr v24.16b, v25.16b, v24.16b + add v5.4s, v22.4s, v5.4s + eor v25.16b, v1.16b, v28.16b + add v6.4s, v21.4s, v6.4s + eor v4.16b, v5.16b, v4.16b + ushr v27.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + add v2.4s, v24.4s, v2.4s + eor v3.16b, v6.16b, v3.16b + orr v25.16b, v25.16b, v27.16b + ushr v27.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + ldur q19, [x29, #-176] + eor v26.16b, v2.16b, v29.16b + orr v4.16b, v4.16b, v27.16b + ushr v27.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + orr v3.16b, v3.16b, v27.16b + ushr v27.4s, v26.4s, #7 + shl v26.4s, v26.4s, #25 + add v20.4s, v20.4s, v18.4s + add v17.4s, v17.4s, v30.4s + orr v26.16b, v26.16b, v27.16b + add v0.4s, v0.4s, v3.4s + eor v16.16b, v0.16b, v16.16b + add v0.4s, v0.4s, v19.4s + add v19.4s, v20.4s, v26.4s + add v17.4s, v17.4s, v25.4s + eor v20.16b, v19.16b, v22.16b + add v7.4s, v19.4s, v7.4s + eor v19.16b, v17.16b, v21.16b + ldur q21, [x29, #-64] + add v23.4s, v23.4s, v4.4s + eor v24.16b, v23.16b, v24.16b + rev32 v16.8h, v16.8h + add v17.4s, v17.4s, v21.4s + rev32 v21.8h, v24.8h + add v6.4s, v6.4s, v21.4s + rev32 v20.8h, v20.8h + add v2.4s, v2.4s, v16.4s + eor v4.16b, v6.16b, v4.16b + rev32 v19.8h, v19.8h + add v1.4s, v1.4s, v20.4s + eor v3.16b, v2.16b, v3.16b + ushr v24.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v5.4s, v5.4s, v19.4s + eor v22.16b, v1.16b, v26.16b + orr v4.16b, v4.16b, v24.16b + ushr v24.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v18.4s, v23.4s, v14.4s + eor v23.16b, v5.16b, v25.16b + orr v3.16b, v3.16b, v24.16b + ushr v24.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + orr v22.16b, v22.16b, v24.16b + ushr v24.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v24.16b + add v18.4s, v18.4s, v4.4s + add v0.4s, v0.4s, v3.4s + add v24.4s, v17.4s, v23.4s + eor v17.16b, v18.16b, v21.16b + add v7.4s, v7.4s, v22.4s + eor v16.16b, v0.16b, v16.16b + ushr v21.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + eor v20.16b, v7.16b, v20.16b + orr v21.16b, v17.16b, v21.16b + ushr v17.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v19.16b, v24.16b, v19.16b + orr v16.16b, v16.16b, v17.16b + ushr v17.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + orr v25.16b, v20.16b, v17.16b + ushr v17.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v17.16b + add v1.4s, v25.4s, v1.4s + eor v22.16b, v1.16b, v22.16b + eor v20.16b, v1.16b, v18.16b + add v1.4s, v19.4s, v5.4s + eor v26.16b, v1.16b, v0.16b + add v0.4s, v21.4s, v6.4s + eor v5.16b, v1.16b, v23.16b + eor v1.16b, v0.16b, v4.16b + eor v17.16b, v0.16b, v7.16b + add v0.4s, v16.4s, v2.4s + eor v2.16b, v0.16b, v3.16b + eor v6.16b, v0.16b, v24.16b + ushr v0.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v0.16b, v1.16b, v0.16b + ushr v1.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v2.16b, v1.16b + ushr v2.4s, v22.4s, #7 + shl v3.4s, v22.4s, #25 + orr v2.16b, v3.16b, v2.16b + ushr v3.4s, v5.4s, #7 + shl v4.4s, v5.4s, #25 + orr v3.16b, v4.16b, v3.16b + eor v8.16b, v16.16b, v3.16b + eor v9.16b, v25.16b, v0.16b + eor v31.16b, v1.16b, v19.16b + cmp x17, x22 + eor v15.16b, v2.16b, v21.16b + mov w18, w19 + b.ne .LBB2_4 +.LBB2_7: + zip1 v0.4s, v20.4s, v26.4s + zip2 v1.4s, v20.4s, v26.4s + zip1 v2.4s, v17.4s, v6.4s + zip2 v3.4s, v17.4s, v6.4s + zip1 v4.4s, v8.4s, v9.4s + zip2 v5.4s, v8.4s, v9.4s + zip1 v6.4s, v31.4s, v15.4s + zip2 v7.4s, v31.4s, v15.4s + add x13, x20, #4 + tst w5, #0x1 + sub x28, x28, #4 + zip1 v16.2d, v0.2d, v2.2d + zip2 v0.2d, v0.2d, v2.2d + zip1 v2.2d, v1.2d, v3.2d + zip2 v1.2d, v1.2d, v3.2d + zip1 v3.2d, v4.2d, v6.2d + zip2 v4.2d, v4.2d, v6.2d + zip1 v6.2d, v5.2d, v7.2d + zip2 v5.2d, v5.2d, v7.2d + add x24, x24, #32 + csel x20, x13, x20, ne + cmp x28, #3 + stp q16, q3, [x26] + stp q0, q4, [x26, #32] + stp q2, q6, [x26, #64] + stp q1, q5, [x26, #96] + add x26, x26, #128 + b.hi .LBB2_2 +.LBB2_8: + cbz x28, .LBB2_16 + orr w8, w7, w19 + and x21, x5, #0x1 + stur w8, [x29, #-64] +.LBB2_10: + ldr x8, [sp, #40] + ldr x25, [x24] + ldur w4, [x29, #-64] + ldp q1, q0, [x8] + mov x8, x22 + stp q1, q0, [x29, #-48] +.LBB2_11: + subs x23, x8, #1 + b.eq .LBB2_13 + cbnz x8, .LBB2_14 + b .LBB2_15 +.LBB2_13: + orr w4, w4, w27 +.LBB2_14: + sub x0, x29, #48 + mov w2, #64 + mov x1, x25 + mov x3, x20 + bl zfs_blake3_compress_in_place_sse2 + add x25, x25, #64 + mov x8, x23 + mov w4, w19 + b .LBB2_11 +.LBB2_15: + ldp q0, q1, [x29, #-48] + add x20, x20, x21 + add x24, x24, #8 + subs x28, x28, #1 + stp q0, q1, [x26], #32 + b.ne .LBB2_10 +.LBB2_16: + add sp, sp, #384 + ldp x20, x19, [sp, #144] + ldp x22, x21, [sp, #128] + ldp x24, x23, [sp, #112] + ldp x26, x25, [sp, #96] + ldp x28, x27, [sp, #80] + ldp x29, x30, [sp, #64] + ldp d9, d8, [sp, #48] + ldp d11, d10, [sp, #32] + ldp d13, d12, [sp, #16] + ldp d15, d14, [sp], #160 + ret +.Lfunc_end2: + .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S new file mode 100644 index 000000000..eb6946400 --- /dev/null +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -0,0 +1,2463 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE4.1 -> ARMv8-A + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if defined(__aarch64__) + .text + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI0_1: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI0_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI0_3: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LCPI0_4: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 + .text + .globl zfs_blake3_compress_in_place_sse41 + .p2align 2 + .type zfs_blake3_compress_in_place_sse41,@function +zfs_blake3_compress_in_place_sse41: + .cfi_startproc + ldp q7, q6, [x0] + ldp q17, q18, [x1] + add x12, x1, #32 + ld2 { v4.4s, v5.4s }, [x12] + lsr x10, x3, #32 + fmov s16, w3 + adrp x13, .LCPI0_0 + adrp x11, .LCPI0_1 + and w8, w2, #0xff + mov v16.s[1], w10 + ldr q0, [x13, :lo12:.LCPI0_0] + ldr q20, [x11, :lo12:.LCPI0_1] + adrp x11, .LCPI0_4 + and w9, w4, #0xff + ldr q2, [x11, :lo12:.LCPI0_4] + mov v16.s[2], w8 + uzp1 v21.4s, v17.4s, v18.4s + add v7.4s, v6.4s, v7.4s + adrp x12, .LCPI0_3 + mov v16.s[3], w9 + uzp2 v18.4s, v17.4s, v18.4s + add v7.4s, v7.4s, v21.4s + ext v17.16b, v5.16b, v5.16b, #12 + ldr q3, [x12, :lo12:.LCPI0_3] + ext v24.16b, v4.16b, v4.16b, #12 + eor v16.16b, v7.16b, v16.16b + mov v27.16b, v17.16b + uzp1 v19.4s, v21.4s, v21.4s + ext v25.16b, v21.16b, v21.16b, #12 + zip2 v28.4s, v18.4s, v17.4s + tbl v29.16b, { v16.16b }, v0.16b + mov v27.s[1], v24.s[2] + zip1 v23.2d, v17.2d, v18.2d + ext v19.16b, v19.16b, v21.16b, #8 + add v22.4s, v29.4s, v20.4s + ext v26.16b, v21.16b, v25.16b, #12 + tbl v20.16b, { v23.16b, v24.16b }, v2.16b + zip1 v21.4s, v28.4s, v24.4s + zip1 v23.4s, v24.4s, v28.4s + uzp2 v19.4s, v19.4s, v18.4s + eor v24.16b, v22.16b, v6.16b + ext v25.16b, v20.16b, v20.16b, #12 + ext v6.16b, v23.16b, v21.16b, #8 + add v7.4s, v7.4s, v18.4s + ext v18.16b, v19.16b, v19.16b, #4 + tbl v16.16b, { v26.16b, v27.16b }, v3.16b + uzp1 v21.4s, v20.4s, v25.4s + mov v26.16b, v6.16b + ext v23.16b, v18.16b, v18.16b, #12 + mov v26.s[1], v21.s[2] + adrp x10, .LCPI0_2 + ext v25.16b, v18.16b, v23.16b, #12 + uzp1 v23.4s, v18.4s, v18.4s + ldr q1, [x10, :lo12:.LCPI0_2] + ext v18.16b, v23.16b, v18.16b, #8 + ushr v23.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + orr v23.16b, v24.16b, v23.16b + add v7.4s, v7.4s, v23.4s + eor v27.16b, v29.16b, v7.16b + add v4.4s, v7.4s, v4.4s + tbl v7.16b, { v25.16b, v26.16b }, v3.16b + tbl v26.16b, { v27.16b }, v1.16b + add v22.4s, v22.4s, v26.4s + uzp2 v18.4s, v18.4s, v16.4s + eor v23.16b, v23.16b, v22.16b + ext v5.16b, v18.16b, v18.16b, #4 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + uzp1 v25.4s, v5.4s, v5.4s + orr v23.16b, v23.16b, v27.16b + ext v28.16b, v4.16b, v4.16b, #12 + ext v4.16b, v25.16b, v5.16b, #8 + ext v25.16b, v26.16b, v26.16b, #8 + add v26.4s, v28.4s, v23.4s + eor v25.16b, v26.16b, v25.16b + ext v22.16b, v22.16b, v22.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v22.4s, v22.4s, v25.4s + eor v23.16b, v23.16b, v22.16b + add v17.4s, v26.4s, v17.4s + ushr v26.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v26.16b + add v17.4s, v17.4s, v23.4s + eor v25.16b, v25.16b, v17.16b + add v17.4s, v17.4s, v19.4s + tbl v19.16b, { v25.16b }, v1.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + ext v17.16b, v17.16b, v17.16b, #4 + orr v23.16b, v23.16b, v25.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v17.4s, v17.4s, v23.4s + eor v19.16b, v17.16b, v19.16b + ext v22.16b, v22.16b, v22.16b, #12 + tbl v19.16b, { v19.16b }, v0.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v17.4s, v17.4s, v16.4s + orr v23.16b, v23.16b, v25.16b + add v17.4s, v17.4s, v23.4s + ext v25.16b, v17.16b, v17.16b, #12 + eor v17.16b, v19.16b, v17.16b + tbl v17.16b, { v17.16b }, v1.16b + add v19.4s, v22.4s, v17.4s + eor v22.16b, v23.16b, v19.16b + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v6.2d, v16.2d + ushr v23.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + zip2 v24.4s, v16.4s, v6.4s + tbl v26.16b, { v20.16b, v21.16b }, v2.16b + orr v22.16b, v22.16b, v23.16b + zip1 v16.4s, v24.4s, v21.4s + zip1 v20.4s, v21.4s, v24.4s + ext v21.16b, v26.16b, v26.16b, #12 + ext v17.16b, v17.16b, v17.16b, #8 + add v25.4s, v25.4s, v22.4s + ext v16.16b, v20.16b, v16.16b, #8 + uzp1 v21.4s, v26.4s, v21.4s + eor v26.16b, v25.16b, v17.16b + ext v19.16b, v19.16b, v19.16b, #4 + tbl v26.16b, { v26.16b }, v0.16b + mov v29.16b, v16.16b + add v19.4s, v19.4s, v26.4s + ext v27.16b, v5.16b, v5.16b, #12 + mov v29.s[1], v21.s[2] + eor v22.16b, v22.16b, v19.16b + ext v28.16b, v5.16b, v27.16b, #12 + ushr v27.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v25.4s, v6.4s + orr v22.16b, v22.16b, v27.16b + add v6.4s, v6.4s, v22.4s + eor v26.16b, v26.16b, v6.16b + add v6.4s, v6.4s, v18.4s + tbl v18.16b, { v26.16b }, v1.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v22.16b, v22.16b, v26.16b + ext v18.16b, v18.16b, v18.16b, #8 + add v6.4s, v6.4s, v22.4s + eor v18.16b, v6.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v18.16b, { v18.16b }, v0.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v6.4s, v7.4s + orr v22.16b, v22.16b, v26.16b + add v6.4s, v6.4s, v22.4s + ext v26.16b, v6.16b, v6.16b, #12 + eor v6.16b, v18.16b, v6.16b + uzp2 v4.4s, v4.4s, v7.4s + zip2 v25.4s, v7.4s, v16.4s + add v26.4s, v26.4s, v21.4s + zip1 v20.2d, v16.2d, v7.2d + tbl v6.16b, { v6.16b }, v1.16b + ext v24.16b, v4.16b, v4.16b, #4 + tbl v27.16b, { v20.16b, v21.16b }, v2.16b + zip1 v7.4s, v25.4s, v21.4s + zip1 v20.4s, v21.4s, v25.4s + add v18.4s, v19.4s, v6.4s + uzp1 v5.4s, v24.4s, v24.4s + ext v21.16b, v27.16b, v27.16b, #12 + ext v7.16b, v20.16b, v7.16b, #8 + eor v19.16b, v22.16b, v18.16b + ext v5.16b, v5.16b, v24.16b, #8 + tbl v17.16b, { v28.16b, v29.16b }, v3.16b + uzp1 v21.4s, v27.4s, v21.4s + mov v28.16b, v7.16b + ushr v22.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v23.16b, v24.16b, v24.16b, #12 + uzp2 v5.4s, v5.4s, v17.4s + mov v28.s[1], v21.s[2] + orr v19.16b, v19.16b, v22.16b + ext v27.16b, v24.16b, v23.16b, #12 + ext v23.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #8 + ext v25.16b, v18.16b, v18.16b, #4 + add v18.4s, v26.4s, v19.4s + uzp1 v24.4s, v23.4s, v23.4s + eor v6.16b, v18.16b, v6.16b + ext v24.16b, v24.16b, v23.16b, #8 + add v16.4s, v18.4s, v16.4s + tbl v18.16b, { v27.16b, v28.16b }, v3.16b + tbl v27.16b, { v6.16b }, v0.16b + uzp2 v6.4s, v24.4s, v18.4s + add v24.4s, v25.4s, v27.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v25.16b + add v16.4s, v16.4s, v19.4s + eor v25.16b, v27.16b, v16.16b + add v4.4s, v16.4s, v4.4s + tbl v16.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v16.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v16.16b, v16.16b, v16.16b, #8 + add v4.4s, v4.4s, v19.4s + eor v16.16b, v4.16b, v16.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v25.16b, { v16.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v16.16b, v19.16b, v24.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v17.4s + orr v19.16b, v16.16b, v19.16b + add v27.4s, v4.4s, v19.4s + eor v25.16b, v25.16b, v27.16b + tbl v25.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v25.4s + zip2 v26.4s, v17.4s, v7.4s + ext v4.16b, v27.16b, v27.16b, #12 + eor v19.16b, v19.16b, v24.16b + add v28.4s, v4.4s, v21.4s + zip1 v20.2d, v7.2d, v17.2d + zip1 v4.4s, v26.4s, v21.4s + zip1 v17.4s, v21.4s, v26.4s + ushr v26.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v26.16b + ext v25.16b, v25.16b, v25.16b, #8 + add v27.4s, v28.4s, v19.4s + eor v25.16b, v27.16b, v25.16b + ext v24.16b, v24.16b, v24.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v19.16b, v19.16b, v24.16b + add v7.4s, v27.4s, v7.4s + ushr v27.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v27.16b + add v7.4s, v7.4s, v19.4s + eor v25.16b, v25.16b, v7.16b + add v5.4s, v7.4s, v5.4s + tbl v7.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v7.16b, v7.16b, v7.16b, #8 + add v5.4s, v5.4s, v19.4s + eor v7.16b, v5.16b, v7.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v7.16b, { v7.16b }, v0.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + tbl v16.16b, { v20.16b, v21.16b }, v2.16b + add v5.4s, v5.4s, v18.4s + orr v19.16b, v19.16b, v25.16b + ext v20.16b, v16.16b, v16.16b, #12 + ext v4.16b, v17.16b, v4.16b, #8 + add v5.4s, v5.4s, v19.4s + uzp1 v21.4s, v16.4s, v20.4s + mov v17.16b, v4.16b + ext v25.16b, v5.16b, v5.16b, #12 + mov v17.s[1], v21.s[2] + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v4.2d, v18.2d + ext v22.16b, v23.16b, v23.16b, #12 + zip2 v26.4s, v18.4s, v4.4s + tbl v18.16b, { v20.16b, v21.16b }, v2.16b + eor v5.16b, v7.16b, v5.16b + ext v16.16b, v23.16b, v22.16b, #12 + ext v22.16b, v6.16b, v6.16b, #4 + zip1 v27.4s, v26.4s, v21.4s + zip1 v20.4s, v21.4s, v26.4s + ext v21.16b, v18.16b, v18.16b, #12 + tbl v5.16b, { v5.16b }, v1.16b + ext v20.16b, v20.16b, v27.16b, #8 + uzp1 v27.4s, v18.4s, v21.4s + uzp1 v18.4s, v22.4s, v22.4s + add v21.4s, v24.4s, v5.4s + ext v18.16b, v18.16b, v22.16b, #8 + eor v19.16b, v19.16b, v21.16b + tbl v7.16b, { v16.16b, v17.16b }, v3.16b + uzp2 v18.4s, v18.4s, v17.4s + zip2 v16.4s, v16.4s, v20.4s + ushr v17.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v17.16b, v19.16b, v17.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v19.4s, v25.4s, v17.4s + eor v5.16b, v19.16b, v5.16b + ext v21.16b, v21.16b, v21.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v4.4s, v19.4s, v4.4s + add v19.4s, v21.4s, v5.4s + eor v17.16b, v17.16b, v19.16b + ushr v21.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v17.16b, v17.16b, v21.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v4.4s, v4.4s, v6.4s + add v6.4s, v19.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v17.16b, v19.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v17.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v7.4s + orr v17.16b, v17.16b, v19.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + mov v29.16b, v20.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v6.4s, v6.4s, v5.4s + mov v29.s[1], v27.s[2] + add v4.4s, v4.4s, v27.4s + zip1 v26.2d, v20.2d, v7.2d + zip1 v7.4s, v16.4s, v27.4s + zip1 v16.4s, v27.4s, v16.4s + eor v17.16b, v17.16b, v6.16b + ext v7.16b, v16.16b, v7.16b, #8 + ushr v16.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v16.16b, v17.16b, v16.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v16.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + ushr v17.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v20.4s + orr v16.16b, v16.16b, v17.16b + add v4.4s, v4.4s, v16.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + add v4.4s, v4.4s, v18.4s + ushr v17.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + ext v23.16b, v22.16b, v22.16b, #12 + ext v4.16b, v4.16b, v4.16b, #4 + orr v16.16b, v16.16b, v17.16b + ext v28.16b, v22.16b, v23.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v16.4s, v4.4s + tbl v3.16b, { v28.16b, v29.16b }, v3.16b + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v4.4s, v3.4s + tbl v4.16b, { v5.16b }, v0.16b + add v5.4s, v6.4s, v4.4s + eor v6.16b, v16.16b, v5.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v16.16b + tbl v2.16b, { v26.16b, v27.16b }, v2.16b + add v3.4s, v3.4s, v6.4s + ext v19.16b, v2.16b, v2.16b, #12 + eor v4.16b, v4.16b, v3.16b + uzp1 v2.4s, v2.4s, v19.4s + ext v3.16b, v3.16b, v3.16b, #12 + tbl v4.16b, { v4.16b }, v1.16b + add v2.4s, v3.4s, v2.4s + add v3.4s, v5.4s, v4.4s + eor v5.16b, v6.16b, v3.16b + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v6.16b + ext v4.16b, v4.16b, v4.16b, #8 + add v2.4s, v2.4s, v5.4s + eor v4.16b, v2.16b, v4.16b + ext v3.16b, v3.16b, v3.16b, #4 + tbl v0.16b, { v4.16b }, v0.16b + add v3.4s, v3.4s, v0.4s + eor v4.16b, v5.16b, v3.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v2.4s, v2.4s, v7.4s + orr v4.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v4.4s + eor v0.16b, v0.16b, v2.16b + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v3.4s, v0.4s + eor v3.16b, v4.16b, v1.16b + ext v2.16b, v2.16b, v2.16b, #4 + ext v1.16b, v1.16b, v1.16b, #12 + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v0.16b, v0.16b, v0.16b, #8 + eor v1.16b, v2.16b, v1.16b + orr v2.16b, v3.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + stp q1, q0, [x0] + ret +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI1_1: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI1_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI1_3: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LCPI1_4: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 + .text + .globl zfs_blake3_compress_xof_sse41 + .p2align 2 + .type zfs_blake3_compress_xof_sse41,@function +zfs_blake3_compress_xof_sse41: + .cfi_startproc + ldp q7, q6, [x0] + ldp q17, q18, [x1] + add x12, x1, #32 + ld2 { v4.4s, v5.4s }, [x12] + lsr x10, x3, #32 + fmov s16, w3 + adrp x13, .LCPI1_0 + adrp x11, .LCPI1_1 + and w8, w2, #0xff + mov v16.s[1], w10 + ldr q0, [x13, :lo12:.LCPI1_0] + ldr q20, [x11, :lo12:.LCPI1_1] + adrp x11, .LCPI1_4 + and w9, w4, #0xff + ldr q2, [x11, :lo12:.LCPI1_4] + mov v16.s[2], w8 + uzp1 v21.4s, v17.4s, v18.4s + add v7.4s, v6.4s, v7.4s + adrp x12, .LCPI1_3 + mov v16.s[3], w9 + uzp2 v18.4s, v17.4s, v18.4s + add v7.4s, v7.4s, v21.4s + ext v17.16b, v5.16b, v5.16b, #12 + ldr q3, [x12, :lo12:.LCPI1_3] + ext v24.16b, v4.16b, v4.16b, #12 + eor v16.16b, v7.16b, v16.16b + mov v27.16b, v17.16b + uzp1 v19.4s, v21.4s, v21.4s + ext v25.16b, v21.16b, v21.16b, #12 + zip2 v28.4s, v18.4s, v17.4s + tbl v29.16b, { v16.16b }, v0.16b + mov v27.s[1], v24.s[2] + zip1 v23.2d, v17.2d, v18.2d + ext v19.16b, v19.16b, v21.16b, #8 + add v22.4s, v29.4s, v20.4s + ext v26.16b, v21.16b, v25.16b, #12 + tbl v20.16b, { v23.16b, v24.16b }, v2.16b + zip1 v21.4s, v28.4s, v24.4s + zip1 v23.4s, v24.4s, v28.4s + uzp2 v19.4s, v19.4s, v18.4s + eor v24.16b, v22.16b, v6.16b + ext v25.16b, v20.16b, v20.16b, #12 + ext v6.16b, v23.16b, v21.16b, #8 + add v7.4s, v7.4s, v18.4s + ext v18.16b, v19.16b, v19.16b, #4 + tbl v16.16b, { v26.16b, v27.16b }, v3.16b + uzp1 v21.4s, v20.4s, v25.4s + mov v26.16b, v6.16b + ext v23.16b, v18.16b, v18.16b, #12 + mov v26.s[1], v21.s[2] + adrp x10, .LCPI1_2 + ext v25.16b, v18.16b, v23.16b, #12 + uzp1 v23.4s, v18.4s, v18.4s + ldr q1, [x10, :lo12:.LCPI1_2] + ext v18.16b, v23.16b, v18.16b, #8 + ushr v23.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + orr v23.16b, v24.16b, v23.16b + add v7.4s, v7.4s, v23.4s + eor v27.16b, v29.16b, v7.16b + add v4.4s, v7.4s, v4.4s + tbl v7.16b, { v25.16b, v26.16b }, v3.16b + tbl v26.16b, { v27.16b }, v1.16b + add v22.4s, v22.4s, v26.4s + uzp2 v18.4s, v18.4s, v16.4s + eor v23.16b, v23.16b, v22.16b + ext v5.16b, v18.16b, v18.16b, #4 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + uzp1 v25.4s, v5.4s, v5.4s + orr v23.16b, v23.16b, v27.16b + ext v28.16b, v4.16b, v4.16b, #12 + ext v4.16b, v25.16b, v5.16b, #8 + ext v25.16b, v26.16b, v26.16b, #8 + add v26.4s, v28.4s, v23.4s + eor v25.16b, v26.16b, v25.16b + ext v22.16b, v22.16b, v22.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v22.4s, v22.4s, v25.4s + eor v23.16b, v23.16b, v22.16b + add v17.4s, v26.4s, v17.4s + ushr v26.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v26.16b + add v17.4s, v17.4s, v23.4s + eor v25.16b, v25.16b, v17.16b + add v17.4s, v17.4s, v19.4s + tbl v19.16b, { v25.16b }, v1.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + ext v17.16b, v17.16b, v17.16b, #4 + orr v23.16b, v23.16b, v25.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v17.4s, v17.4s, v23.4s + eor v19.16b, v17.16b, v19.16b + ext v22.16b, v22.16b, v22.16b, #12 + tbl v19.16b, { v19.16b }, v0.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v17.4s, v17.4s, v16.4s + orr v23.16b, v23.16b, v25.16b + add v17.4s, v17.4s, v23.4s + ext v25.16b, v17.16b, v17.16b, #12 + eor v17.16b, v19.16b, v17.16b + tbl v17.16b, { v17.16b }, v1.16b + add v19.4s, v22.4s, v17.4s + eor v22.16b, v23.16b, v19.16b + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v6.2d, v16.2d + ushr v23.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + zip2 v24.4s, v16.4s, v6.4s + tbl v26.16b, { v20.16b, v21.16b }, v2.16b + orr v22.16b, v22.16b, v23.16b + zip1 v16.4s, v24.4s, v21.4s + zip1 v20.4s, v21.4s, v24.4s + ext v21.16b, v26.16b, v26.16b, #12 + ext v17.16b, v17.16b, v17.16b, #8 + add v25.4s, v25.4s, v22.4s + ext v16.16b, v20.16b, v16.16b, #8 + uzp1 v21.4s, v26.4s, v21.4s + eor v26.16b, v25.16b, v17.16b + ext v19.16b, v19.16b, v19.16b, #4 + tbl v26.16b, { v26.16b }, v0.16b + mov v29.16b, v16.16b + add v19.4s, v19.4s, v26.4s + ext v27.16b, v5.16b, v5.16b, #12 + mov v29.s[1], v21.s[2] + eor v22.16b, v22.16b, v19.16b + ext v28.16b, v5.16b, v27.16b, #12 + ushr v27.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v25.4s, v6.4s + orr v22.16b, v22.16b, v27.16b + add v6.4s, v6.4s, v22.4s + eor v26.16b, v26.16b, v6.16b + add v6.4s, v6.4s, v18.4s + tbl v18.16b, { v26.16b }, v1.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v22.16b, v22.16b, v26.16b + ext v18.16b, v18.16b, v18.16b, #8 + add v6.4s, v6.4s, v22.4s + eor v18.16b, v6.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v18.16b, { v18.16b }, v0.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v6.4s, v7.4s + orr v22.16b, v22.16b, v26.16b + add v6.4s, v6.4s, v22.4s + ext v26.16b, v6.16b, v6.16b, #12 + eor v6.16b, v18.16b, v6.16b + uzp2 v4.4s, v4.4s, v7.4s + zip2 v25.4s, v7.4s, v16.4s + add v26.4s, v26.4s, v21.4s + zip1 v20.2d, v16.2d, v7.2d + tbl v6.16b, { v6.16b }, v1.16b + ext v24.16b, v4.16b, v4.16b, #4 + tbl v27.16b, { v20.16b, v21.16b }, v2.16b + zip1 v7.4s, v25.4s, v21.4s + zip1 v20.4s, v21.4s, v25.4s + add v18.4s, v19.4s, v6.4s + uzp1 v5.4s, v24.4s, v24.4s + ext v21.16b, v27.16b, v27.16b, #12 + ext v7.16b, v20.16b, v7.16b, #8 + eor v19.16b, v22.16b, v18.16b + ext v5.16b, v5.16b, v24.16b, #8 + tbl v17.16b, { v28.16b, v29.16b }, v3.16b + uzp1 v21.4s, v27.4s, v21.4s + mov v28.16b, v7.16b + ushr v22.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v23.16b, v24.16b, v24.16b, #12 + uzp2 v5.4s, v5.4s, v17.4s + mov v28.s[1], v21.s[2] + orr v19.16b, v19.16b, v22.16b + ext v27.16b, v24.16b, v23.16b, #12 + ext v23.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #8 + ext v25.16b, v18.16b, v18.16b, #4 + add v18.4s, v26.4s, v19.4s + uzp1 v24.4s, v23.4s, v23.4s + eor v6.16b, v18.16b, v6.16b + ext v24.16b, v24.16b, v23.16b, #8 + add v16.4s, v18.4s, v16.4s + tbl v18.16b, { v27.16b, v28.16b }, v3.16b + tbl v27.16b, { v6.16b }, v0.16b + uzp2 v6.4s, v24.4s, v18.4s + add v24.4s, v25.4s, v27.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v25.16b + add v16.4s, v16.4s, v19.4s + eor v25.16b, v27.16b, v16.16b + add v4.4s, v16.4s, v4.4s + tbl v16.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v16.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v16.16b, v16.16b, v16.16b, #8 + add v4.4s, v4.4s, v19.4s + eor v16.16b, v4.16b, v16.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v25.16b, { v16.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v16.16b, v19.16b, v24.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v17.4s + orr v19.16b, v16.16b, v19.16b + add v27.4s, v4.4s, v19.4s + eor v25.16b, v25.16b, v27.16b + tbl v25.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v25.4s + zip2 v26.4s, v17.4s, v7.4s + ext v4.16b, v27.16b, v27.16b, #12 + eor v19.16b, v19.16b, v24.16b + add v28.4s, v4.4s, v21.4s + zip1 v20.2d, v7.2d, v17.2d + zip1 v4.4s, v26.4s, v21.4s + zip1 v17.4s, v21.4s, v26.4s + ushr v26.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v26.16b + ext v25.16b, v25.16b, v25.16b, #8 + add v27.4s, v28.4s, v19.4s + eor v25.16b, v27.16b, v25.16b + ext v24.16b, v24.16b, v24.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v19.16b, v19.16b, v24.16b + add v7.4s, v27.4s, v7.4s + ushr v27.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v27.16b + add v7.4s, v7.4s, v19.4s + eor v25.16b, v25.16b, v7.16b + add v5.4s, v7.4s, v5.4s + tbl v7.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v7.16b, v7.16b, v7.16b, #8 + add v5.4s, v5.4s, v19.4s + eor v7.16b, v5.16b, v7.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v7.16b, { v7.16b }, v0.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + tbl v16.16b, { v20.16b, v21.16b }, v2.16b + add v5.4s, v5.4s, v18.4s + orr v19.16b, v19.16b, v25.16b + ext v20.16b, v16.16b, v16.16b, #12 + ext v4.16b, v17.16b, v4.16b, #8 + add v5.4s, v5.4s, v19.4s + uzp1 v21.4s, v16.4s, v20.4s + mov v17.16b, v4.16b + ext v25.16b, v5.16b, v5.16b, #12 + mov v17.s[1], v21.s[2] + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v4.2d, v18.2d + ext v22.16b, v23.16b, v23.16b, #12 + zip2 v26.4s, v18.4s, v4.4s + tbl v18.16b, { v20.16b, v21.16b }, v2.16b + eor v5.16b, v7.16b, v5.16b + ext v16.16b, v23.16b, v22.16b, #12 + ext v22.16b, v6.16b, v6.16b, #4 + zip1 v27.4s, v26.4s, v21.4s + zip1 v20.4s, v21.4s, v26.4s + ext v21.16b, v18.16b, v18.16b, #12 + tbl v5.16b, { v5.16b }, v1.16b + ext v20.16b, v20.16b, v27.16b, #8 + uzp1 v27.4s, v18.4s, v21.4s + uzp1 v18.4s, v22.4s, v22.4s + add v21.4s, v24.4s, v5.4s + ext v18.16b, v18.16b, v22.16b, #8 + eor v19.16b, v19.16b, v21.16b + tbl v7.16b, { v16.16b, v17.16b }, v3.16b + uzp2 v18.4s, v18.4s, v17.4s + zip2 v16.4s, v16.4s, v20.4s + ushr v17.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v17.16b, v19.16b, v17.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v19.4s, v25.4s, v17.4s + eor v5.16b, v19.16b, v5.16b + ext v21.16b, v21.16b, v21.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v4.4s, v19.4s, v4.4s + add v19.4s, v21.4s, v5.4s + eor v17.16b, v17.16b, v19.16b + ushr v21.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v17.16b, v17.16b, v21.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v4.4s, v4.4s, v6.4s + add v6.4s, v19.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v17.16b, v19.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v17.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v7.4s + orr v17.16b, v17.16b, v19.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + mov v29.16b, v20.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v6.4s, v6.4s, v5.4s + mov v29.s[1], v27.s[2] + add v4.4s, v4.4s, v27.4s + zip1 v26.2d, v20.2d, v7.2d + zip1 v7.4s, v16.4s, v27.4s + zip1 v16.4s, v27.4s, v16.4s + eor v17.16b, v17.16b, v6.16b + ext v7.16b, v16.16b, v7.16b, #8 + ushr v16.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v16.16b, v17.16b, v16.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v16.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + ushr v17.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v20.4s + orr v16.16b, v16.16b, v17.16b + add v4.4s, v4.4s, v16.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + add v4.4s, v4.4s, v18.4s + ushr v17.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + ext v23.16b, v22.16b, v22.16b, #12 + ext v4.16b, v4.16b, v4.16b, #4 + orr v16.16b, v16.16b, v17.16b + ext v28.16b, v22.16b, v23.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v16.4s, v4.4s + tbl v3.16b, { v28.16b, v29.16b }, v3.16b + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v4.4s, v3.4s + tbl v4.16b, { v5.16b }, v0.16b + add v5.4s, v6.4s, v4.4s + eor v6.16b, v16.16b, v5.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v16.16b + tbl v2.16b, { v26.16b, v27.16b }, v2.16b + add v3.4s, v3.4s, v6.4s + ext v19.16b, v2.16b, v2.16b, #12 + eor v4.16b, v4.16b, v3.16b + uzp1 v2.4s, v2.4s, v19.4s + ext v3.16b, v3.16b, v3.16b, #12 + tbl v4.16b, { v4.16b }, v1.16b + add v2.4s, v3.4s, v2.4s + add v3.4s, v5.4s, v4.4s + eor v5.16b, v6.16b, v3.16b + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v6.16b + ext v4.16b, v4.16b, v4.16b, #8 + add v2.4s, v2.4s, v5.4s + eor v4.16b, v2.16b, v4.16b + ext v3.16b, v3.16b, v3.16b, #4 + tbl v0.16b, { v4.16b }, v0.16b + add v3.4s, v3.4s, v0.4s + eor v4.16b, v5.16b, v3.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v2.4s, v2.4s, v7.4s + orr v4.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v4.4s + eor v0.16b, v0.16b, v2.16b + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v3.4s, v0.4s + eor v3.16b, v4.16b, v1.16b + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + ext v0.16b, v0.16b, v0.16b, #8 + ext v1.16b, v1.16b, v1.16b, #12 + orr v3.16b, v3.16b, v4.16b + eor v2.16b, v2.16b, v1.16b + eor v3.16b, v3.16b, v0.16b + stp q2, q3, [x5] + ldr q2, [x0] + eor v1.16b, v2.16b, v1.16b + str q1, [x5, #32] + ldr q1, [x0, #16] + eor v0.16b, v1.16b, v0.16b + str q0, [x5, #48] + ret +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_0: + .word 0 + .word 1 + .word 2 + .word 3 +.LCPI2_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI2_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 + .text + .globl zfs_blake3_hash_many_sse41 + .p2align 2 + .type zfs_blake3_hash_many_sse41,@function +zfs_blake3_hash_many_sse41: + .cfi_startproc + stp d15, d14, [sp, #-160]! + stp d13, d12, [sp, #16] + stp d11, d10, [sp, #32] + stp d9, d8, [sp, #48] + stp x29, x30, [sp, #64] + stp x28, x27, [sp, #80] + stp x26, x25, [sp, #96] + stp x24, x23, [sp, #112] + stp x22, x21, [sp, #128] + stp x20, x19, [sp, #144] + mov x29, sp + sub sp, sp, #448 + .cfi_def_cfa w29, 160 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + ldr x26, [x29, #168] + ldrb w27, [x29, #160] + mov w19, w6 + mov x20, x4 + mov x22, x2 + mov x28, x1 + cmp x1, #4 + mov x24, x0 + str x3, [sp, #40] + b.lo .LBB2_8 + adrp x11, .LCPI2_0 + ldr q0, [x11, :lo12:.LCPI2_0] + sbfx w13, w5, #0, #1 + dup v1.4s, w13 + mov w10, #58983 + mov w11, #44677 + mov w12, #62322 + and v0.16b, v1.16b, v0.16b + mov w13, #62778 + orr w8, w7, w19 + adrp x9, .LCPI2_1 + movk w10, #27145, lsl #16 + movk w11, #47975, lsl #16 + movk w12, #15470, lsl #16 + movk w13, #42319, lsl #16 + str q0, [sp, #16] + orr v0.4s, #128, lsl #24 + adrp x14, .LCPI2_2 + str q0, [sp] +.LBB2_2: + ldr x2, [sp, #40] + mov x15, x2 + ld1r { v7.4s }, [x15], #4 + add x16, x2, #8 + add x17, x2, #12 + add x18, x2, #16 + add x0, x2, #20 + add x3, x2, #24 + add x2, x2, #28 + ld1r { v6.4s }, [x16] + ld1r { v17.4s }, [x17] + ld1r { v10.4s }, [x18] + ld1r { v11.4s }, [x0] + ld1r { v19.4s }, [x3] + ld1r { v18.4s }, [x15] + ld1r { v16.4s }, [x2] + cbz x22, .LBB2_7 + ldr q1, [sp, #16] + dup v0.4s, w20 + ldp x15, x16, [x24] + ldp x17, x18, [x24, #16] + add v1.4s, v0.4s, v1.4s + movi v0.4s, #128, lsl #24 + str q1, [sp, #64] + eor v0.16b, v1.16b, v0.16b + ldr q1, [sp] + lsr x2, x20, #32 + mov x0, xzr + mov w6, w8 + cmgt v0.4s, v1.4s, v0.4s + dup v1.4s, w2 + sub v0.4s, v1.4s, v0.4s + str q0, [sp, #48] +.LBB2_4: + mov w4, #16 + stp q16, q17, [sp, #192] + bfi x4, x0, #6, #58 + ldr q1, [x15, x4] + ldr q3, [x16, x4] + ldr q2, [x17, x4] + ldr q4, [x18, x4] + mov w4, #32 + bfi x4, x0, #6, #58 + ldr q5, [x15, x4] + ldr q20, [x16, x4] + ldr q21, [x17, x4] + ldr q22, [x18, x4] + mov w4, #48 + lsl x3, x0, #6 + bfi x4, x0, #6, #58 + add x0, x0, #1 + ldr q0, [x15, x3] + ldr q23, [x16, x3] + ldr q16, [x17, x3] + ldr q17, [x18, x3] + cmp x0, x22 + ldr q25, [x15, x4] + ldr q14, [x16, x4] + ldr q28, [x17, x4] + ldr q31, [x18, x4] + csel w4, w27, wzr, eq + orr w4, w4, w6 + mov x2, xzr + and w6, w4, #0xff + add x3, x3, #256 +.LBB2_5: + ldr x4, [x24, x2] + add x2, x2, #8 + cmp x2, #32 + add x4, x4, x3 + prfm pldl1keep, [x4] + b.ne .LBB2_5 + zip1 v29.4s, v0.4s, v23.4s + zip2 v23.4s, v0.4s, v23.4s + zip1 v0.4s, v16.4s, v17.4s + zip2 v24.4s, v16.4s, v17.4s + zip1 v9.4s, v1.4s, v3.4s + zip2 v26.4s, v1.4s, v3.4s + zip1 v27.4s, v2.4s, v4.4s + zip2 v17.4s, v2.4s, v4.4s + zip1 v12.4s, v21.4s, v22.4s + zip2 v13.4s, v21.4s, v22.4s + add v2.4s, v7.4s, v10.4s + add v1.4s, v18.4s, v11.4s + ext v7.16b, v0.16b, v29.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + zip1 v30.4s, v5.4s, v20.4s + zip2 v20.4s, v5.4s, v20.4s + stp q1, q2, [sp, #112] + ext v2.16b, v29.16b, v7.16b, #8 + mov v29.d[1], v0.d[0] + ext v18.16b, v23.16b, v22.16b, #8 + mov v23.d[1], v24.d[0] + zip1 v21.4s, v25.4s, v14.4s + zip2 v4.4s, v25.4s, v14.4s + zip1 v14.4s, v28.4s, v31.4s + zip2 v15.4s, v28.4s, v31.4s + add v8.4s, v6.4s, v19.4s + ext v28.16b, v27.16b, v9.16b, #8 + ext v31.16b, v17.16b, v26.16b, #8 + stur q2, [x29, #-208] + mov v7.16b, v29.16b + ext v0.16b, v12.16b, v30.16b, #8 + stp q23, q29, [x29, #-80] + mov v2.16b, v19.16b + ext v19.16b, v13.16b, v20.16b, #8 + mov v29.16b, v9.16b + ext v25.16b, v9.16b, v28.16b, #8 + mov v29.d[1], v27.d[0] + ext v24.16b, v26.16b, v31.16b, #8 + mov v26.d[1], v17.d[0] + ext v17.16b, v15.16b, v4.16b, #8 + ext v27.16b, v30.16b, v0.16b, #8 + ext v0.16b, v20.16b, v19.16b, #8 + stp q0, q25, [sp, #80] + ext v0.16b, v4.16b, v17.16b, #8 + str q0, [sp, #224] + ldr q0, [sp, #128] + mov v6.16b, v23.16b + mov v22.16b, v4.16b + ldr q16, [x9, :lo12:.LCPI2_1] + add v17.4s, v0.4s, v7.4s + ldr q0, [sp, #112] + mov v30.d[1], v12.d[0] + add v7.4s, v8.4s, v29.4s + mov v20.d[1], v13.d[0] + add v4.4s, v0.4s, v6.4s + ldr q0, [sp, #64] + dup v3.4s, w12 + ext v28.16b, v14.16b, v21.16b, #8 + dup v1.4s, w10 + eor v19.16b, v17.16b, v0.16b + ldr q0, [sp, #48] + ext v23.16b, v21.16b, v28.16b, #8 + mov v21.d[1], v14.d[0] + tbl v14.16b, { v19.16b }, v16.16b + eor v12.16b, v4.16b, v0.16b + movi v0.4s, #64 + eor v13.16b, v7.16b, v0.16b + tbl v13.16b, { v13.16b }, v16.16b + add v6.4s, v13.4s, v3.4s + dup v5.4s, w11 + tbl v12.16b, { v12.16b }, v16.16b + add v1.4s, v14.4s, v1.4s + eor v9.16b, v6.16b, v2.16b + ldp q2, q0, [sp, #192] + add v5.4s, v12.4s, v5.4s + eor v19.16b, v1.16b, v10.16b + eor v10.16b, v5.16b, v11.16b + ushr v11.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v11.16b, v19.16b, v11.16b + ushr v19.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + mov v22.d[1], v15.d[0] + orr v10.16b, v10.16b, v19.16b + ushr v19.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + add v15.4s, v0.4s, v2.4s + orr v9.16b, v9.16b, v19.16b + dup v19.4s, w6 + add v15.4s, v15.4s, v26.4s + eor v19.16b, v15.16b, v19.16b + tbl v3.16b, { v19.16b }, v16.16b + dup v19.4s, w13 + add v8.4s, v3.4s, v19.4s + ldur q31, [x29, #-208] + eor v19.16b, v8.16b, v2.16b + ushr v0.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v2.16b, v19.16b, v0.16b + ldr q19, [x14, :lo12:.LCPI2_2] + add v17.4s, v17.4s, v31.4s + add v17.4s, v17.4s, v11.4s + eor v14.16b, v14.16b, v17.16b + tbl v14.16b, { v14.16b }, v19.16b + add v1.4s, v1.4s, v14.4s + eor v11.16b, v1.16b, v11.16b + add v4.4s, v4.4s, v18.4s + ushr v0.4s, v11.4s, #7 + shl v11.4s, v11.4s, #25 + add v4.4s, v4.4s, v10.4s + orr v0.16b, v11.16b, v0.16b + eor v11.16b, v12.16b, v4.16b + tbl v11.16b, { v11.16b }, v19.16b + add v5.4s, v5.4s, v11.4s + eor v10.16b, v5.16b, v10.16b + add v7.4s, v7.4s, v25.4s + ushr v12.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + add v7.4s, v7.4s, v9.4s + orr v10.16b, v10.16b, v12.16b + eor v12.16b, v13.16b, v7.16b + tbl v12.16b, { v12.16b }, v19.16b + add v6.4s, v6.4s, v12.4s + eor v9.16b, v6.16b, v9.16b + ushr v13.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + orr v9.16b, v9.16b, v13.16b + add v13.4s, v15.4s, v24.4s + add v13.4s, v13.4s, v2.4s + eor v3.16b, v3.16b, v13.16b + tbl v3.16b, { v3.16b }, v19.16b + add v8.4s, v8.4s, v3.4s + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v30.4s + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v21.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v22.4s + mov v28.16b, v26.16b + stur q26, [x29, #-112] + mov v26.16b, v18.16b + mov v18.16b, v24.16b + stur q24, [x29, #-160] + add v6.4s, v6.4s, v3.4s + mov v24.16b, v20.16b + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldr q20, [sp, #80] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + stp q30, q22, [x29, #-192] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + mov v30.16b, v27.16b + add v17.4s, v17.4s, v27.4s + ldr q27, [sp, #224] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v23.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v27.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + stur q21, [x29, #-144] + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + ldur q21, [x29, #-80] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v21.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v26.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v18.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v29.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-64] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v28.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v24.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v23.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-144] + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v31.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v27.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldr q27, [sp, #96] + mov v21.16b, v26.16b + stur q26, [x29, #-96] + mov v28.16b, v31.16b + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldp q31, q26, [x29, #-192] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v20.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v27.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v26.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v31.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + mov v18.16b, v24.16b + mov v24.16b, v20.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ldur q20, [x29, #-160] + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v21.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v18.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v23.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v20.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q25, [x29, #-80] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v29.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v25.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v26.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + ldur q25, [x29, #-112] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v30.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v24.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v31.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q25, [x29, #-64] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldr q31, [sp, #224] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v27.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v25.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v31.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v28.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v26.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v23.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + mov v21.16b, v29.16b + stur q29, [x29, #-128] + mov v29.16b, v30.16b + mov v30.16b, v27.16b + mov v27.16b, v18.16b + str q18, [sp, #176] + eor v0.16b, v0.16b, v1.16b + mov v18.16b, v22.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-96] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v20.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v29.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v31.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v21.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v24.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v28.4s + add v6.4s, v6.4s, v3.4s + mov v22.16b, v24.16b + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q24, [x29, #-80] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + mov v21.16b, v30.16b + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldur q30, [x29, #-192] + mov v20.16b, v29.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + ldur q29, [x29, #-112] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v24.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v29.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v20.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v31.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v26.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v23.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v27.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v30.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + ldur q27, [x29, #-160] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v27.4s + mov v28.16b, v25.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v21.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v28.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v29.4s + mov v25.16b, v31.16b + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q31, [x29, #-96] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldur q28, [x29, #-208] + mov v18.16b, v20.16b + str q20, [sp, #144] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + ldur q20, [x29, #-128] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v24.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v31.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v28.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v20.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v30.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v25.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + add v17.4s, v17.4s, v26.4s + mov v26.16b, v21.16b + add v4.4s, v4.4s, v21.4s + ldur q21, [x29, #-144] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v21.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v28.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + str q23, [sp, #160] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + add v17.4s, v17.4s, v23.4s + ldur q23, [x29, #-64] + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v23.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v24.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v20.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + add v13.4s, v13.4s, v0.4s + ldr q20, [sp, #176] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + eor v12.16b, v12.16b, v13.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + tbl v12.16b, { v12.16b }, v16.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v31.4s + orr v2.16b, v2.16b, v15.16b + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + add v7.4s, v7.4s, v29.4s + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v2.4s + orr v0.16b, v0.16b, v15.16b + mov v15.16b, v31.16b + add v17.4s, v17.4s, v22.4s + eor v31.16b, v14.16b, v4.16b + eor v22.16b, v11.16b, v7.16b + add v11.4s, v13.4s, v27.4s + tbl v3.16b, { v3.16b }, v19.16b + add v11.4s, v11.4s, v0.4s + tbl v31.16b, { v31.16b }, v19.16b + add v6.4s, v6.4s, v3.4s + eor v12.16b, v12.16b, v11.16b + tbl v22.16b, { v22.16b }, v19.16b + add v8.4s, v8.4s, v31.4s + eor v10.16b, v6.16b, v10.16b + add v30.4s, v11.4s, v30.4s + tbl v11.16b, { v12.16b }, v19.16b + add v1.4s, v1.4s, v22.4s + eor v9.16b, v8.16b, v9.16b + ushr v12.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + add v5.4s, v5.4s, v11.4s + eor v2.16b, v1.16b, v2.16b + orr v10.16b, v10.16b, v12.16b + ushr v12.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v9.16b, v9.16b, v12.16b + ushr v12.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v2.16b, v2.16b, v12.16b + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v12.16b + add v4.4s, v4.4s, v26.4s + add v17.4s, v17.4s, v0.4s + add v7.4s, v7.4s, v28.4s + mov v18.16b, v27.16b + eor v31.16b, v31.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v27.4s, v30.4s, v2.4s + eor v22.16b, v22.16b, v4.16b + add v7.4s, v7.4s, v9.4s + eor v3.16b, v3.16b, v27.16b + add v26.4s, v27.4s, v29.4s + tbl v27.16b, { v31.16b }, v16.16b + eor v28.16b, v11.16b, v7.16b + tbl v22.16b, { v22.16b }, v16.16b + add v1.4s, v1.4s, v27.4s + add v4.4s, v4.4s, v23.4s + ldr q23, [sp, #144] + tbl v28.16b, { v28.16b }, v16.16b + tbl v3.16b, { v3.16b }, v16.16b + add v5.4s, v5.4s, v22.4s + eor v0.16b, v0.16b, v1.16b + add v6.4s, v6.4s, v28.4s + add v29.4s, v8.4s, v3.4s + eor v30.16b, v5.16b, v10.16b + ushr v8.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v31.16b, v6.16b, v9.16b + orr v0.16b, v0.16b, v8.16b + ushr v8.4s, v30.4s, #12 + shl v30.4s, v30.4s, #20 + eor v2.16b, v29.16b, v2.16b + orr v30.16b, v30.16b, v8.16b + ushr v8.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + add v17.4s, v17.4s, v25.4s + add v7.4s, v7.4s, v23.4s + orr v31.16b, v31.16b, v8.16b + ushr v8.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ldur q23, [x29, #-176] + orr v2.16b, v2.16b, v8.16b + add v17.4s, v17.4s, v0.4s + eor v27.16b, v27.16b, v17.16b + add v4.4s, v4.4s, v30.4s + add v25.4s, v26.4s, v2.4s + eor v22.16b, v22.16b, v4.16b + add v4.4s, v4.4s, v24.4s + add v7.4s, v7.4s, v31.4s + eor v3.16b, v3.16b, v25.16b + add v24.4s, v25.4s, v18.4s + tbl v25.16b, { v27.16b }, v19.16b + add v17.4s, v17.4s, v23.4s + eor v23.16b, v28.16b, v7.16b + tbl v22.16b, { v22.16b }, v19.16b + add v1.4s, v1.4s, v25.4s + tbl v23.16b, { v23.16b }, v19.16b + tbl v3.16b, { v3.16b }, v19.16b + add v5.4s, v5.4s, v22.4s + eor v0.16b, v0.16b, v1.16b + add v6.4s, v6.4s, v23.4s + add v26.4s, v29.4s, v3.4s + eor v27.16b, v5.16b, v30.16b + ushr v29.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v28.16b, v6.16b, v31.16b + orr v0.16b, v0.16b, v29.16b + ushr v29.4s, v27.4s, #7 + shl v27.4s, v27.4s, #25 + eor v2.16b, v26.16b, v2.16b + orr v27.16b, v27.16b, v29.16b + ushr v29.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + ldur q18, [x29, #-128] + orr v28.16b, v28.16b, v29.16b + ushr v29.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v7.4s, v7.4s, v15.4s + orr v2.16b, v2.16b, v29.16b + add v17.4s, v17.4s, v27.4s + add v4.4s, v4.4s, v28.4s + add v7.4s, v7.4s, v2.4s + eor v3.16b, v3.16b, v17.16b + add v17.4s, v17.4s, v20.4s + eor v20.16b, v25.16b, v4.16b + add v4.4s, v4.4s, v21.4s + eor v21.16b, v22.16b, v7.16b + add v7.4s, v7.4s, v18.4s + add v18.4s, v24.4s, v0.4s + eor v22.16b, v23.16b, v18.16b + ldr q23, [sp, #160] + tbl v3.16b, { v3.16b }, v16.16b + tbl v20.16b, { v20.16b }, v16.16b + add v6.4s, v6.4s, v3.4s + add v18.4s, v18.4s, v23.4s + tbl v21.16b, { v21.16b }, v16.16b + tbl v16.16b, { v22.16b }, v16.16b + add v22.4s, v26.4s, v20.4s + eor v23.16b, v6.16b, v27.16b + add v1.4s, v1.4s, v21.4s + eor v24.16b, v22.16b, v28.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v5.4s, v5.4s, v16.4s + eor v2.16b, v1.16b, v2.16b + orr v23.16b, v23.16b, v25.16b + ushr v25.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + eor v0.16b, v5.16b, v0.16b + orr v24.16b, v24.16b, v25.16b + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + ushr v25.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + orr v0.16b, v0.16b, v25.16b + add v25.4s, v7.4s, v2.4s + add v26.4s, v18.4s, v0.4s + eor v18.16b, v21.16b, v25.16b + add v17.4s, v17.4s, v23.4s + add v4.4s, v4.4s, v24.4s + eor v16.16b, v16.16b, v26.16b + tbl v21.16b, { v18.16b }, v19.16b + eor v3.16b, v3.16b, v17.16b + eor v7.16b, v20.16b, v4.16b + tbl v16.16b, { v16.16b }, v19.16b + add v1.4s, v1.4s, v21.4s + tbl v3.16b, { v3.16b }, v19.16b + tbl v20.16b, { v7.16b }, v19.16b + eor v2.16b, v1.16b, v2.16b + eor v7.16b, v1.16b, v17.16b + add v1.4s, v5.4s, v16.4s + eor v0.16b, v1.16b, v0.16b + eor v18.16b, v1.16b, v4.16b + add v1.4s, v6.4s, v3.4s + eor v4.16b, v1.16b, v23.16b + eor v6.16b, v25.16b, v1.16b + add v1.4s, v22.4s, v20.4s + eor v5.16b, v1.16b, v24.16b + eor v17.16b, v26.16b, v1.16b + ushr v1.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + orr v1.16b, v4.16b, v1.16b + ushr v4.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v4.16b, v5.16b, v4.16b + ushr v5.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v2.16b, v2.16b, v5.16b + ushr v5.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v5.16b + eor v10.16b, v0.16b, v20.16b + eor v11.16b, v1.16b, v21.16b + eor v19.16b, v4.16b, v16.16b + cmp x0, x22 + eor v16.16b, v2.16b, v3.16b + mov w6, w19 + b.ne .LBB2_4 +.LBB2_7: + zip1 v0.4s, v7.4s, v18.4s + zip2 v1.4s, v7.4s, v18.4s + zip1 v2.4s, v6.4s, v17.4s + zip2 v3.4s, v6.4s, v17.4s + zip1 v4.4s, v10.4s, v11.4s + zip2 v5.4s, v10.4s, v11.4s + zip1 v6.4s, v19.4s, v16.4s + zip2 v7.4s, v19.4s, v16.4s + add x15, x20, #4 + tst w5, #0x1 + sub x28, x28, #4 + zip1 v16.2d, v0.2d, v2.2d + zip2 v0.2d, v0.2d, v2.2d + zip1 v2.2d, v1.2d, v3.2d + zip2 v1.2d, v1.2d, v3.2d + zip1 v3.2d, v4.2d, v6.2d + zip2 v4.2d, v4.2d, v6.2d + zip1 v6.2d, v5.2d, v7.2d + zip2 v5.2d, v5.2d, v7.2d + add x24, x24, #32 + csel x20, x15, x20, ne + cmp x28, #3 + stp q16, q3, [x26] + stp q0, q4, [x26, #32] + stp q2, q6, [x26, #64] + stp q1, q5, [x26, #96] + add x26, x26, #128 + b.hi .LBB2_2 +.LBB2_8: + cbz x28, .LBB2_16 + orr w8, w7, w19 + and x21, x5, #0x1 + stur w8, [x29, #-64] +.LBB2_10: + ldr x8, [sp, #40] + ldr x25, [x24] + ldur w4, [x29, #-64] + ldp q1, q0, [x8] + mov x8, x22 + stp q1, q0, [x29, #-48] +.LBB2_11: + subs x23, x8, #1 + b.eq .LBB2_13 + cbnz x8, .LBB2_14 + b .LBB2_15 +.LBB2_13: + orr w4, w4, w27 +.LBB2_14: + sub x0, x29, #48 + mov w2, #64 + mov x1, x25 + mov x3, x20 + bl zfs_blake3_compress_in_place_sse41 + add x25, x25, #64 + mov x8, x23 + mov w4, w19 + b .LBB2_11 +.LBB2_15: + ldp q0, q1, [x29, #-48] + add x20, x20, x21 + add x24, x24, #8 + subs x28, x28, #1 + stp q0, q1, [x26], #32 + b.ne .LBB2_10 +.LBB2_16: + add sp, sp, #448 + ldp x20, x19, [sp, #144] + ldp x22, x21, [sp, #128] + ldp x24, x23, [sp, #112] + ldp x26, x25, [sp, #96] + ldp x28, x27, [sp, #80] + ldp x29, x30, [sp, #64] + ldp d9, d8, [sp, #48] + ldp d11, d10, [sp, #32] + ldp d13, d12, [sp, #16] + ldp d15, d14, [sp], #160 + ret +.Lfunc_end2: + .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S new file mode 100644 index 000000000..9deba202f --- /dev/null +++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S @@ -0,0 +1,2823 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE2 -> POWER8 PPC64 Little Endian + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + .text + .abiversion 2 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI0_1: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI0_2: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_3: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_4: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_5: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_6: + .short 1 + .short 2 + .short 4 + .short 8 + .short 16 + .short 32 + .short 64 + .short 128 +.LCPI0_7: + .short 0 + .short 0 + .short 4 + .short 8 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI0_8: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_9: + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI0_10: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_12: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_13: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_14: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .text + .globl zfs_blake3_compress_in_place_sse2 + .p2align 2 + .type zfs_blake3_compress_in_place_sse2,@function +zfs_blake3_compress_in_place_sse2: +.Lfunc_begin0: + .cfi_startproc +.Lfunc_gep0: + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 2, 2, .TOC.-.Lfunc_gep0@l +.Lfunc_lep0: + .localentry zfs_blake3_compress_in_place_sse2, .Lfunc_lep0-.Lfunc_gep0 + li 8, -64 + mtvsrd 35, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 12, 9 + stxvd2x 60, 1, 8 + li 8, -48 + mtvsrd 36, 7 + lfd 2, 16(4) + stxvd2x 61, 1, 8 + li 8, -32 + lfd 1, 8(4) + mtvsrwz 37, 6 + rldicl 6, 6, 32, 32 + addis 7, 2, .LCPI0_2@toc@ha + stxvd2x 62, 1, 8 + li 8, -16 + addi 7, 7, .LCPI0_2@toc@l + stxvd2x 63, 1, 8 + li 8, 0 + lvx 9, 0, 7 + li 7, 48 + mtvsrd 34, 8 + xxmrghd 32, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + lfd 3, 24(4) + addis 8, 2, .LCPI0_5@toc@ha + vmrghb 3, 2, 3 + addi 8, 8, .LCPI0_5@toc@l + vmrghb 4, 2, 4 + vspltb 2, 2, 7 + xxmrghd 33, 3, 2 + vpkudum 7, 1, 0 + vmrglh 3, 2, 3 + vmrglh 2, 2, 4 + mtvsrwz 36, 6 + addis 6, 2, .LCPI0_0@toc@ha + addi 6, 6, .LCPI0_0@toc@l + vperm 10, 1, 0, 9 + vmrghw 4, 4, 5 + xxswapd 37, 1 + lxvd2x 1, 4, 7 + addis 7, 2, .LCPI0_8@toc@ha + addi 7, 7, .LCPI0_8@toc@l + vmrglw 2, 2, 3 + xxswapd 35, 0 + xxswapd 41, 1 + xxspltd 62, 42, 1 + vadduwm 3, 7, 3 + vadduwm 6, 3, 5 + xxmrgld 36, 34, 36 + lvx 2, 0, 6 + addis 6, 2, .LCPI0_1@toc@ha + addi 6, 6, .LCPI0_1@toc@l + xxlxor 35, 38, 36 + lvx 4, 0, 6 + li 6, 32 + lxvd2x 0, 4, 6 + addis 4, 2, .LCPI0_3@toc@ha + addis 6, 2, .LCPI0_7@toc@ha + vperm 8, 3, 3, 2 + vspltisw 3, 10 + addi 4, 4, .LCPI0_3@toc@l + addi 6, 6, .LCPI0_7@toc@l + vadduwm 3, 3, 3 + vadduwm 11, 8, 4 + xxlxor 36, 43, 37 + vadduwm 5, 6, 10 + vrlw 0, 4, 3 + vspltisw 4, 12 + vadduwm 4, 4, 4 + vadduwm 1, 0, 5 + xxlxor 37, 33, 40 + xxswapd 40, 0 + vrlw 6, 5, 4 + vspltisw 5, -16 + vpkudum 13, 9, 8 + vsubuwm 5, 12, 5 + lvx 12, 0, 4 + addis 4, 2, .LCPI0_4@toc@ha + addi 4, 4, .LCPI0_4@toc@l + vadduwm 11, 6, 11 + xxswapd 0, 38 + vadduwm 1, 1, 13 + xxsldwi 50, 45, 45, 1 + xxlxor 32, 43, 32 + xxsldwi 43, 43, 43, 3 + xxsldwi 33, 33, 33, 1 + vperm 12, 8, 9, 12 + vrlw 0, 0, 5 + vadduwm 1, 0, 1 + xxlxor 38, 33, 0 + vadduwm 1, 1, 12 + vperm 6, 6, 6, 2 + vadduwm 15, 6, 11 + lvx 11, 0, 4 + addis 4, 2, .LCPI0_6@toc@ha + addi 4, 4, .LCPI0_6@toc@l + xxlxor 32, 47, 32 + lvx 17, 0, 4 + addis 4, 2, .LCPI0_9@toc@ha + vperm 14, 10, 7, 11 + addi 4, 4, .LCPI0_9@toc@l + vrlw 0, 0, 3 + vadduwm 1, 0, 1 + xxlxor 38, 33, 38 + vrlw 6, 6, 4 + vadduwm 8, 6, 15 + xxswapd 0, 38 + lvx 6, 0, 8 + xxlxor 32, 40, 32 + xxsldwi 40, 40, 40, 1 + vperm 13, 12, 18, 6 + vrlw 9, 0, 5 + vadduwm 0, 1, 14 + lvx 1, 0, 7 + xxsldwi 46, 46, 46, 3 + xxsldwi 32, 32, 32, 3 + vperm 7, 7, 7, 1 + vadduwm 15, 9, 0 + xxlxor 32, 47, 0 + vperm 16, 0, 0, 2 + lvx 0, 0, 6 + addis 6, 2, .LCPI0_10@toc@ha + vcmpequh 0, 0, 17 + vadduwm 19, 16, 8 + xxlxor 40, 51, 41 + xxsel 45, 39, 45, 32 + vrlw 31, 8, 3 + lvx 8, 0, 4 + addis 4, 2, .LCPI0_11@toc@ha + addi 4, 4, .LCPI0_11@toc@l + vcmpequh 7, 8, 17 + vadduwm 8, 15, 13 + vadduwm 15, 31, 8 + lvx 8, 0, 4 + addi 4, 6, .LCPI0_10@toc@l + lvx 17, 0, 4 + addis 4, 2, .LCPI0_12@toc@ha + xxlxor 41, 47, 48 + xxsldwi 47, 47, 47, 1 + addi 4, 4, .LCPI0_12@toc@l + xxlnor 48, 39, 39 + vrlw 29, 9, 4 + vperm 9, 16, 16, 8 + xxland 48, 50, 39 + vperm 17, 30, 12, 17 + vperm 16, 16, 16, 8 + vmrghw 12, 12, 10 + lvx 10, 0, 4 + addis 4, 2, .LCPI0_13@toc@ha + vadduwm 19, 29, 19 + addi 4, 4, .LCPI0_13@toc@l + xxlxor 63, 51, 63 + xxsldwi 51, 51, 51, 3 + xxland 0, 49, 41 + vrlw 17, 31, 5 + xxlor 48, 0, 48 + xxswapd 0, 61 + vperm 18, 12, 18, 10 + vadduwm 15, 15, 16 + xxland 60, 48, 39 + vadduwm 15, 17, 15 + vperm 28, 28, 28, 8 + xxlxor 63, 47, 0 + vadduwm 15, 15, 18 + vperm 31, 31, 31, 2 + vperm 30, 18, 16, 6 + vadduwm 19, 31, 19 + xxlxor 44, 51, 49 + vrlw 12, 12, 3 + vadduwm 15, 12, 15 + xxlxor 49, 47, 63 + vperm 31, 13, 14, 11 + vrlw 17, 17, 4 + vperm 14, 14, 14, 1 + vadduwm 15, 15, 31 + vadduwm 19, 17, 19 + xxswapd 0, 49 + xxsldwi 47, 47, 47, 3 + xxsel 46, 46, 62, 32 + xxlxor 44, 51, 44 + xxsldwi 51, 51, 51, 1 + vrlw 12, 12, 5 + vadduwm 15, 12, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 19, 17, 19 + xxlxor 44, 51, 44 + vrlw 29, 12, 3 + vadduwm 12, 15, 14 + vadduwm 15, 29, 12 + lvx 12, 0, 4 + addis 4, 2, .LCPI0_14@toc@ha + addi 4, 4, .LCPI0_14@toc@l + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + vperm 30, 13, 18, 12 + vrlw 17, 17, 4 + vmrghw 13, 18, 13 + xxland 0, 62, 41 + vadduwm 19, 17, 19 + vperm 16, 13, 16, 10 + xxlxor 61, 51, 61 + xxsldwi 50, 51, 51, 3 + xxsldwi 51, 63, 63, 3 + vrlw 30, 29, 5 + xxlor 61, 60, 0 + xxswapd 0, 49 + vperm 31, 14, 19, 11 + vadduwm 15, 15, 29 + vperm 19, 19, 19, 1 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 16 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 62 + vperm 30, 16, 29, 6 + vrlw 13, 13, 3 + vadduwm 15, 13, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 31 + xxsldwi 63, 63, 63, 3 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 45, 50, 45 + xxsldwi 50, 50, 50, 1 + vrlw 13, 13, 5 + vadduwm 15, 13, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 45 + vrlw 28, 13, 3 + xxsel 45, 51, 62, 32 + xxland 51, 61, 39 + vperm 30, 14, 16, 12 + vadduwm 15, 15, 13 + vperm 19, 19, 19, 8 + vmrghw 14, 16, 14 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 62, 41 + vrlw 17, 17, 4 + xxlor 51, 51, 0 + vadduwm 15, 15, 19 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 60, 50, 60 + xxsldwi 48, 50, 50, 3 + vperm 18, 14, 29, 10 + vrlw 30, 28, 5 + vperm 29, 18, 19, 6 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 18 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 62 + vperm 30, 13, 31, 11 + vrlw 14, 14, 3 + vperm 31, 31, 31, 1 + vadduwm 15, 14, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 46, 48, 46 + xxsldwi 48, 48, 48, 1 + vrlw 14, 14, 5 + vadduwm 15, 14, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 46 + vrlw 28, 14, 3 + xxsel 46, 63, 61, 32 + xxland 63, 51, 39 + vperm 29, 13, 18, 12 + vadduwm 15, 15, 14 + vperm 31, 31, 31, 8 + vmrghw 13, 18, 13 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 61, 41 + vrlw 17, 17, 4 + xxlor 63, 63, 0 + vperm 13, 13, 19, 10 + xxsldwi 51, 62, 62, 3 + vadduwm 15, 15, 31 + vperm 30, 14, 19, 11 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 60, 48, 60 + xxsldwi 48, 48, 48, 3 + vrlw 29, 28, 5 + vadduwm 15, 29, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 50, 48, 61 + vrlw 18, 18, 3 + vadduwm 15, 18, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 11, 17, 16 + xxswapd 0, 49 + xxlxor 48, 43, 50 + xxsldwi 43, 43, 43, 1 + vperm 18, 19, 19, 1 + vrlw 16, 16, 5 + vperm 19, 13, 31, 6 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 29, 17, 11 + xxlxor 43, 61, 48 + vrlw 16, 11, 3 + xxsel 43, 50, 51, 32 + xxland 50, 63, 39 + vperm 19, 14, 13, 12 + vadduwm 15, 15, 11 + vperm 18, 18, 18, 8 + vmrghw 13, 13, 14 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 51, 41 + lvx 19, 0, 4 + vrlw 17, 17, 4 + xxlor 50, 50, 0 + vperm 13, 13, 31, 10 + xxsldwi 63, 62, 62, 3 + vadduwm 15, 15, 18 + vperm 19, 11, 31, 19 + vadduwm 29, 17, 29 + xxswapd 0, 49 + vperm 1, 31, 31, 1 + xxlxor 48, 61, 48 + xxsldwi 46, 61, 61, 3 + vperm 6, 13, 18, 6 + vrlw 16, 16, 5 + xxsel 32, 33, 38, 32 + xxland 38, 50, 39 + vadduwm 15, 16, 15 + vperm 7, 11, 13, 12 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vperm 6, 6, 6, 8 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 3 + vrlw 17, 17, 4 + vadduwm 15, 15, 19 + vadduwm 14, 17, 14 + xxswapd 0, 49 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 5 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vadduwm 0, 15, 0 + vperm 17, 17, 17, 2 + xxland 0, 39, 41 + xxlor 38, 38, 0 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 0, 16, 0 + xxlxor 33, 32, 49 + xxsldwi 32, 32, 32, 1 + vrlw 1, 1, 4 + vadduwm 0, 0, 6 + vadduwm 8, 1, 14 + xxswapd 0, 33 + xxlxor 44, 40, 48 + xxsldwi 38, 40, 40, 3 + vrlw 7, 12, 5 + vadduwm 0, 7, 0 + xxlxor 33, 32, 0 + vperm 2, 1, 1, 2 + vmrghw 1, 13, 11 + vadduwm 6, 2, 6 + vperm 1, 1, 18, 10 + xxlxor 39, 38, 39 + vrlw 3, 7, 3 + vadduwm 0, 0, 1 + vadduwm 0, 3, 0 + xxlxor 34, 32, 34 + xxsldwi 0, 32, 32, 3 + vrlw 2, 2, 4 + vadduwm 4, 2, 6 + xxswapd 2, 34 + xxlxor 35, 36, 35 + xxsldwi 1, 36, 36, 1 + vrlw 3, 3, 5 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 1, 35, 2 + stxvd2x 0, 0, 3 + xxswapd 1, 1 + stxvd2x 1, 3, 5 + li 3, -16 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI1_1: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI1_2: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_3: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_4: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_5: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_6: + .short 1 + .short 2 + .short 4 + .short 8 + .short 16 + .short 32 + .short 64 + .short 128 +.LCPI1_7: + .short 0 + .short 0 + .short 4 + .short 8 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI1_8: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_9: + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI1_10: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_12: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_13: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_14: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .text + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: +.Lfunc_begin1: + .cfi_startproc +.Lfunc_gep1: + addis 2, 12, .TOC.-.Lfunc_gep1@ha + addi 2, 2, .TOC.-.Lfunc_gep1@l +.Lfunc_lep1: + .localentry zfs_blake3_compress_xof_sse2, .Lfunc_lep1-.Lfunc_gep1 + li 9, -80 + mtvsrd 35, 5 + li 5, 16 + lfdx 0, 0, 4 + addis 10, 2, .LCPI1_2@toc@ha + vspltisw 12, 9 + std 30, -16(1) + addis 12, 2, .LCPI1_8@toc@ha + addis 30, 2, .LCPI1_5@toc@ha + addis 11, 2, .LCPI1_7@toc@ha + stxvd2x 60, 1, 9 + li 9, -64 + mtvsrd 36, 7 + lfd 2, 16(4) + addi 10, 10, .LCPI1_2@toc@l + addi 12, 12, .LCPI1_8@toc@l + addi 11, 11, .LCPI1_7@toc@l + stxvd2x 61, 1, 9 + li 9, -48 + lfd 3, 24(4) + mtvsrwz 37, 6 + rldicl 6, 6, 32, 32 + lvx 9, 0, 10 + stxvd2x 62, 1, 9 + li 9, -32 + li 10, 32 + stxvd2x 63, 1, 9 + li 9, 0 + mtvsrd 34, 9 + xxmrghd 33, 3, 2 + lfd 1, 8(4) + vmrghb 3, 2, 3 + vmrghb 4, 2, 4 + vspltb 2, 2, 7 + xxmrghd 32, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + vpkudum 7, 1, 0 + vmrglh 3, 2, 3 + vmrglh 2, 2, 4 + mtvsrwz 36, 6 + addis 6, 2, .LCPI1_0@toc@ha + addi 6, 6, .LCPI1_0@toc@l + vperm 10, 1, 0, 9 + vmrghw 4, 4, 5 + xxswapd 37, 1 + vmrglw 2, 2, 3 + xxswapd 35, 0 + lxvd2x 0, 4, 10 + xxspltd 62, 42, 1 + vadduwm 3, 7, 3 + vadduwm 6, 3, 5 + xxmrgld 36, 34, 36 + lvx 2, 0, 6 + addis 6, 2, .LCPI1_1@toc@ha + addi 6, 6, .LCPI1_1@toc@l + xxlxor 35, 38, 36 + lvx 4, 0, 6 + li 6, 48 + lxvd2x 1, 4, 6 + addis 4, 2, .LCPI1_3@toc@ha + vperm 8, 3, 3, 2 + vspltisw 3, 10 + addi 4, 4, .LCPI1_3@toc@l + xxswapd 41, 1 + vadduwm 3, 3, 3 + vadduwm 11, 8, 4 + xxlxor 36, 43, 37 + vadduwm 5, 6, 10 + vrlw 0, 4, 3 + vspltisw 4, 12 + vadduwm 4, 4, 4 + vadduwm 1, 0, 5 + xxlxor 37, 33, 40 + xxswapd 40, 0 + vrlw 6, 5, 4 + vspltisw 5, -16 + vpkudum 13, 9, 8 + vsubuwm 5, 12, 5 + lvx 12, 0, 4 + addis 4, 2, .LCPI1_4@toc@ha + addi 4, 4, .LCPI1_4@toc@l + vadduwm 11, 6, 11 + xxswapd 0, 38 + vadduwm 1, 1, 13 + xxsldwi 50, 45, 45, 1 + xxlxor 32, 43, 32 + xxsldwi 43, 43, 43, 3 + xxsldwi 33, 33, 33, 1 + vperm 12, 8, 9, 12 + vrlw 0, 0, 5 + vadduwm 1, 0, 1 + xxlxor 38, 33, 0 + vadduwm 1, 1, 12 + vperm 6, 6, 6, 2 + vadduwm 15, 6, 11 + lvx 11, 0, 4 + addis 4, 2, .LCPI1_6@toc@ha + addi 4, 4, .LCPI1_6@toc@l + xxlxor 32, 47, 32 + lvx 17, 0, 4 + addi 4, 30, .LCPI1_5@toc@l + vperm 14, 10, 7, 11 + vrlw 0, 0, 3 + vadduwm 1, 0, 1 + xxlxor 38, 33, 38 + vrlw 6, 6, 4 + vadduwm 8, 6, 15 + xxswapd 0, 38 + lvx 6, 0, 4 + addis 4, 2, .LCPI1_9@toc@ha + addi 4, 4, .LCPI1_9@toc@l + xxlxor 32, 40, 32 + xxsldwi 40, 40, 40, 1 + vperm 13, 12, 18, 6 + vrlw 9, 0, 5 + vadduwm 0, 1, 14 + lvx 1, 0, 12 + xxsldwi 46, 46, 46, 3 + xxsldwi 32, 32, 32, 3 + vperm 7, 7, 7, 1 + vadduwm 15, 9, 0 + xxlxor 32, 47, 0 + vperm 16, 0, 0, 2 + lvx 0, 0, 11 + addis 11, 2, .LCPI1_10@toc@ha + vcmpequh 0, 0, 17 + vadduwm 19, 16, 8 + xxlxor 40, 51, 41 + xxsel 45, 39, 45, 32 + vrlw 31, 8, 3 + lvx 8, 0, 4 + addis 4, 2, .LCPI1_11@toc@ha + addi 4, 4, .LCPI1_11@toc@l + vcmpequh 7, 8, 17 + vadduwm 8, 15, 13 + vadduwm 15, 31, 8 + lvx 8, 0, 4 + addi 4, 11, .LCPI1_10@toc@l + lvx 17, 0, 4 + addis 4, 2, .LCPI1_12@toc@ha + xxlxor 41, 47, 48 + xxsldwi 47, 47, 47, 1 + addi 4, 4, .LCPI1_12@toc@l + xxlnor 48, 39, 39 + vrlw 29, 9, 4 + vperm 9, 16, 16, 8 + xxland 48, 50, 39 + vperm 17, 30, 12, 17 + vperm 16, 16, 16, 8 + vmrghw 12, 12, 10 + lvx 10, 0, 4 + addis 4, 2, .LCPI1_13@toc@ha + vadduwm 19, 29, 19 + addi 4, 4, .LCPI1_13@toc@l + xxlxor 63, 51, 63 + xxsldwi 51, 51, 51, 3 + xxland 0, 49, 41 + vrlw 17, 31, 5 + xxlor 48, 0, 48 + xxswapd 0, 61 + vperm 18, 12, 18, 10 + vadduwm 15, 15, 16 + xxland 60, 48, 39 + vadduwm 15, 17, 15 + vperm 28, 28, 28, 8 + xxlxor 63, 47, 0 + vadduwm 15, 15, 18 + vperm 31, 31, 31, 2 + vperm 30, 18, 16, 6 + vadduwm 19, 31, 19 + xxlxor 44, 51, 49 + vrlw 12, 12, 3 + vadduwm 15, 12, 15 + xxlxor 49, 47, 63 + vperm 31, 13, 14, 11 + vrlw 17, 17, 4 + vperm 14, 14, 14, 1 + vadduwm 15, 15, 31 + vadduwm 19, 17, 19 + xxswapd 0, 49 + xxsldwi 47, 47, 47, 3 + xxsel 46, 46, 62, 32 + xxlxor 44, 51, 44 + xxsldwi 51, 51, 51, 1 + vrlw 12, 12, 5 + vadduwm 15, 12, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 19, 17, 19 + xxlxor 44, 51, 44 + vrlw 29, 12, 3 + vadduwm 12, 15, 14 + vadduwm 15, 29, 12 + lvx 12, 0, 4 + addis 4, 2, .LCPI1_14@toc@ha + addi 4, 4, .LCPI1_14@toc@l + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + vperm 30, 13, 18, 12 + vrlw 17, 17, 4 + vmrghw 13, 18, 13 + xxland 0, 62, 41 + vadduwm 19, 17, 19 + vperm 16, 13, 16, 10 + xxlxor 61, 51, 61 + xxsldwi 50, 51, 51, 3 + xxsldwi 51, 63, 63, 3 + vrlw 30, 29, 5 + xxlor 61, 60, 0 + xxswapd 0, 49 + vperm 31, 14, 19, 11 + vadduwm 15, 15, 29 + vperm 19, 19, 19, 1 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 16 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 62 + vperm 30, 16, 29, 6 + vrlw 13, 13, 3 + vadduwm 15, 13, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 31 + xxsldwi 63, 63, 63, 3 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 45, 50, 45 + xxsldwi 50, 50, 50, 1 + vrlw 13, 13, 5 + vadduwm 15, 13, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 45 + vrlw 28, 13, 3 + xxsel 45, 51, 62, 32 + xxland 51, 61, 39 + vperm 30, 14, 16, 12 + vadduwm 15, 15, 13 + vperm 19, 19, 19, 8 + vmrghw 14, 16, 14 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 62, 41 + vrlw 17, 17, 4 + xxlor 51, 51, 0 + vadduwm 15, 15, 19 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 60, 50, 60 + xxsldwi 48, 50, 50, 3 + vperm 18, 14, 29, 10 + vrlw 30, 28, 5 + vperm 29, 18, 19, 6 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 18 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 62 + vperm 30, 13, 31, 11 + vrlw 14, 14, 3 + vperm 31, 31, 31, 1 + vadduwm 15, 14, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 46, 48, 46 + xxsldwi 48, 48, 48, 1 + vrlw 14, 14, 5 + vadduwm 15, 14, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 46 + vrlw 28, 14, 3 + xxsel 46, 63, 61, 32 + xxland 63, 51, 39 + vperm 29, 13, 18, 12 + vadduwm 15, 15, 14 + vperm 31, 31, 31, 8 + vmrghw 13, 18, 13 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 61, 41 + vrlw 17, 17, 4 + xxlor 63, 63, 0 + vperm 13, 13, 19, 10 + xxsldwi 51, 62, 62, 3 + vadduwm 15, 15, 31 + vperm 30, 14, 19, 11 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 60, 48, 60 + xxsldwi 48, 48, 48, 3 + vrlw 29, 28, 5 + vadduwm 15, 29, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 50, 48, 61 + vrlw 18, 18, 3 + vadduwm 15, 18, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 11, 17, 16 + xxswapd 0, 49 + xxlxor 48, 43, 50 + xxsldwi 43, 43, 43, 1 + vperm 18, 19, 19, 1 + vrlw 16, 16, 5 + vperm 19, 13, 31, 6 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 29, 17, 11 + xxlxor 43, 61, 48 + vrlw 16, 11, 3 + xxsel 43, 50, 51, 32 + xxland 50, 63, 39 + vperm 19, 14, 13, 12 + vadduwm 15, 15, 11 + vperm 18, 18, 18, 8 + vmrghw 13, 13, 14 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 51, 41 + lvx 19, 0, 4 + vrlw 17, 17, 4 + xxlor 50, 50, 0 + vperm 13, 13, 31, 10 + xxsldwi 63, 62, 62, 3 + vadduwm 15, 15, 18 + vperm 19, 11, 31, 19 + vadduwm 29, 17, 29 + xxswapd 0, 49 + vperm 1, 31, 31, 1 + xxlxor 48, 61, 48 + xxsldwi 46, 61, 61, 3 + vperm 6, 13, 18, 6 + vrlw 16, 16, 5 + xxsel 32, 33, 38, 32 + xxland 38, 50, 39 + vadduwm 15, 16, 15 + vperm 7, 11, 13, 12 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vperm 6, 6, 6, 8 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 3 + vrlw 17, 17, 4 + vadduwm 15, 15, 19 + vadduwm 14, 17, 14 + xxswapd 0, 49 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 5 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vadduwm 0, 15, 0 + vperm 17, 17, 17, 2 + xxland 0, 39, 41 + xxlor 38, 38, 0 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 0, 16, 0 + xxlxor 33, 32, 49 + xxsldwi 32, 32, 32, 1 + vrlw 1, 1, 4 + vadduwm 0, 0, 6 + vadduwm 8, 1, 14 + xxswapd 0, 33 + xxlxor 44, 40, 48 + xxsldwi 38, 40, 40, 3 + vrlw 7, 12, 5 + vadduwm 0, 7, 0 + xxlxor 33, 32, 0 + vperm 2, 1, 1, 2 + vmrghw 1, 13, 11 + vadduwm 6, 2, 6 + vperm 1, 1, 18, 10 + xxlxor 39, 38, 39 + vrlw 3, 7, 3 + vadduwm 0, 0, 1 + vadduwm 0, 3, 0 + xxlxor 34, 32, 34 + xxsldwi 0, 32, 32, 3 + vrlw 2, 2, 4 + vadduwm 4, 2, 6 + xxswapd 2, 34 + xxlxor 35, 36, 35 + xxsldwi 1, 36, 36, 1 + vrlw 3, 3, 5 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 3, 35, 2 + stxvd2x 0, 0, 8 + xxswapd 3, 3 + stxvd2x 3, 8, 5 + lfdx 0, 0, 3 + lfd 3, 8(3) + xxmrghd 34, 3, 0 + xxlxor 0, 1, 34 + xxswapd 0, 0 + stxvd2x 0, 8, 10 + lfd 0, 16(3) + lfd 1, 24(3) + li 3, -32 + xxmrghd 34, 1, 0 + xxlxor 0, 2, 34 + xxswapd 0, 0 + stxvd2x 0, 8, 6 + lxvd2x 63, 1, 3 + li 3, -48 + ld 30, -16(1) + lxvd2x 62, 1, 3 + li 3, -64 + lxvd2x 61, 1, 3 + li 3, -80 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-.Lfunc_begin1 + .cfi_endproc + + .globl zfs_blake3_hash_many_sse2 + .p2align 2 + .type zfs_blake3_hash_many_sse2,@function +zfs_blake3_hash_many_sse2: +.Lfunc_begin2: + .cfi_startproc +.Lfunc_gep2: + addis 2, 12, .TOC.-.Lfunc_gep2@ha + addi 2, 2, .TOC.-.Lfunc_gep2@l +.Lfunc_lep2: + .localentry zfs_blake3_hash_many_sse2, .Lfunc_lep2-.Lfunc_gep2 + mfocrf 12, 32 + mflr 0 + std 0, 16(1) + stw 12, 8(1) + stdu 1, -256(1) + .cfi_def_cfa_offset 256 + .cfi_offset lr, 16 + .cfi_offset r17, -120 + .cfi_offset r18, -112 + .cfi_offset r19, -104 + .cfi_offset r20, -96 + .cfi_offset r21, -88 + .cfi_offset r22, -80 + .cfi_offset r23, -72 + .cfi_offset r24, -64 + .cfi_offset r25, -56 + .cfi_offset r26, -48 + .cfi_offset r27, -40 + .cfi_offset r28, -32 + .cfi_offset r29, -24 + .cfi_offset r30, -16 + .cfi_offset cr2, 8 + std 26, 208(1) + mr 26, 4 + cmpldi 1, 4, 4 + andi. 4, 8, 1 + std 18, 144(1) + std 19, 152(1) + crmove 8, 1 + ld 19, 360(1) + lwz 18, 352(1) + std 24, 192(1) + std 25, 200(1) + std 27, 216(1) + std 28, 224(1) + mr 24, 10 + mr 28, 6 + mr 27, 5 + mr 25, 3 + std 29, 232(1) + std 30, 240(1) + mr 30, 9 + mr 29, 7 + std 17, 136(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + blt 1, .LBB2_3 + li 3, 0 + li 4, 1 + clrldi 23, 30, 32 + isel 22, 4, 3, 8 + clrldi 21, 24, 32 + clrldi 20, 18, 32 +.LBB2_2: + mr 3, 25 + mr 4, 27 + mr 5, 28 + mr 6, 29 + mr 7, 22 + mr 8, 23 + mr 9, 21 + mr 10, 20 + std 19, 32(1) + bl blake3_hash4_sse2 + addi 26, 26, -4 + addi 3, 29, 4 + addi 25, 25, 32 + addi 19, 19, 128 + cmpldi 26, 3 + isel 29, 3, 29, 8 + bgt 0, .LBB2_2 +.LBB2_3: + cmpldi 26, 0 + beq 0, .LBB2_11 + li 3, 0 + li 4, 1 + or 21, 24, 30 + li 20, 16 + addi 24, 1, 96 + isel 22, 4, 3, 8 +.LBB2_5: + lxvd2x 0, 28, 20 + ld 23, 0(25) + mr 17, 27 + mr 3, 21 + stxvd2x 0, 24, 20 + lxvd2x 0, 0, 28 + stxvd2x 0, 0, 24 +.LBB2_6: + cmpldi 17, 1 + beq 0, .LBB2_8 + cmpldi 17, 0 + bne 0, .LBB2_9 + b .LBB2_10 +.LBB2_8: + or 3, 3, 18 +.LBB2_9: + clrldi 7, 3, 56 + mr 3, 24 + mr 4, 23 + li 5, 64 + mr 6, 29 + bl zfs_blake3_compress_in_place_sse2 + addi 23, 23, 64 + addi 17, 17, -1 + mr 3, 30 + b .LBB2_6 +.LBB2_10: + lxvd2x 0, 24, 20 + addi 26, 26, -1 + add 29, 29, 22 + addi 25, 25, 8 + cmpldi 26, 0 + stxvd2x 0, 19, 20 + lxvd2x 0, 0, 24 + stxvd2x 0, 0, 19 + addi 19, 19, 32 + bne 0, .LBB2_5 +.LBB2_11: + ld 30, 240(1) + ld 29, 232(1) + ld 28, 224(1) + ld 27, 216(1) + ld 26, 208(1) + ld 25, 200(1) + ld 24, 192(1) + ld 23, 184(1) + ld 22, 176(1) + ld 21, 168(1) + ld 20, 160(1) + ld 19, 152(1) + ld 18, 144(1) + ld 17, 136(1) + addi 1, 1, 256 + ld 0, 16(1) + lwz 12, 8(1) + mtocrf 32, 12 + mtlr 0 + blr + .long 0 + .quad 0 +.Lfunc_end2: + .size zfs_blake3_hash_many_sse2, .Lfunc_end2-.Lfunc_begin2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI3_0: + .quad 4294967296 + .quad 12884901890 +.LCPI3_1: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI3_2: + .long 1779033703 + .long 1779033703 + .long 1779033703 + .long 1779033703 +.LCPI3_3: + .long 3144134277 + .long 3144134277 + .long 3144134277 + .long 3144134277 +.LCPI3_4: + .long 1013904242 + .long 1013904242 + .long 1013904242 + .long 1013904242 +.LCPI3_5: + .long 2773480762 + .long 2773480762 + .long 2773480762 + .long 2773480762 + .text + .p2align 2 + .type blake3_hash4_sse2,@function +blake3_hash4_sse2: +.Lfunc_begin3: + .cfi_startproc +.Lfunc_gep3: + addis 2, 12, .TOC.-.Lfunc_gep3@ha + addi 2, 2, .TOC.-.Lfunc_gep3@l +.Lfunc_lep3: + .localentry blake3_hash4_sse2, .Lfunc_lep3-.Lfunc_gep3 + stdu 1, -400(1) + .cfi_def_cfa_offset 400 + .cfi_offset r22, -152 + .cfi_offset r23, -144 + .cfi_offset r24, -136 + .cfi_offset r25, -128 + .cfi_offset r26, -120 + .cfi_offset r27, -112 + .cfi_offset r28, -104 + .cfi_offset r29, -96 + .cfi_offset r30, -88 + .cfi_offset f23, -72 + .cfi_offset f24, -64 + .cfi_offset f25, -56 + .cfi_offset f26, -48 + .cfi_offset f27, -40 + .cfi_offset f28, -32 + .cfi_offset f29, -24 + .cfi_offset f30, -16 + .cfi_offset f31, -8 + .cfi_offset v20, -352 + .cfi_offset v21, -336 + .cfi_offset v22, -320 + .cfi_offset v23, -304 + .cfi_offset v24, -288 + .cfi_offset v25, -272 + .cfi_offset v26, -256 + .cfi_offset v27, -240 + .cfi_offset v28, -224 + .cfi_offset v29, -208 + .cfi_offset v30, -192 + .cfi_offset v31, -176 + li 11, 48 + li 0, 8 + std 30, 312(1) + li 30, 12 + li 12, 4 + lfiwzx 0, 0, 5 + stxvd2x 52, 1, 11 + li 11, 64 + lfiwzx 2, 5, 0 + li 0, 20 + lfiwzx 3, 5, 30 + stxvd2x 53, 1, 11 + li 11, 80 + li 30, 24 + lfiwzx 4, 5, 0 + li 0, 28 + stxvd2x 54, 1, 11 + li 11, 96 + lfiwzx 1, 5, 12 + lfiwzx 6, 5, 30 + xxspltw 45, 0, 1 + cmpldi 4, 0 + std 22, 248(1) + stxvd2x 55, 1, 11 + li 11, 112 + lfiwzx 7, 5, 0 + xxspltw 40, 2, 1 + std 23, 256(1) + xxspltw 38, 3, 1 + xxspltw 50, 4, 1 + std 24, 264(1) + std 25, 272(1) + std 26, 280(1) + xxspltw 54, 7, 1 + std 27, 288(1) + std 28, 296(1) + std 29, 304(1) + stxvd2x 56, 1, 11 + li 11, 128 + stfd 23, 328(1) + stxvd2x 57, 1, 11 + li 11, 144 + stfd 24, 336(1) + stxvd2x 58, 1, 11 + li 11, 160 + stfd 25, 344(1) + stxvd2x 59, 1, 11 + li 11, 176 + xxspltw 59, 1, 1 + stxvd2x 60, 1, 11 + li 11, 192 + stfd 26, 352(1) + stxvd2x 61, 1, 11 + li 11, 208 + stfd 27, 360(1) + stxvd2x 62, 1, 11 + li 11, 224 + xxspltw 62, 6, 1 + stxvd2x 63, 1, 11 + li 11, 16 + stfd 28, 368(1) + lfiwzx 5, 5, 11 + ld 5, 432(1) + stfd 29, 376(1) + stfd 30, 384(1) + stfd 31, 392(1) + xxspltw 61, 5, 1 + beq 0, .LBB3_5 + addis 30, 2, .LCPI3_0@toc@ha + neg 7, 7 + xxleqv 34, 34, 34 + addis 28, 2, .LCPI3_2@toc@ha + addis 27, 2, .LCPI3_3@toc@ha + addis 26, 2, .LCPI3_4@toc@ha + addis 25, 2, .LCPI3_5@toc@ha + ld 29, 24(3) + addi 0, 30, .LCPI3_0@toc@l + mtfprwz 1, 7 + addis 7, 2, .LCPI3_1@toc@ha + ld 30, 16(3) + lxvd2x 0, 0, 0 + mtfprwz 2, 6 + rldicl 6, 6, 32, 32 + addi 0, 7, .LCPI3_1@toc@l + ld 7, 8(3) + vslw 2, 2, 2 + lvx 5, 0, 0 + addi 0, 28, .LCPI3_2@toc@l + addi 28, 27, .LCPI3_3@toc@l + addi 27, 26, .LCPI3_4@toc@l + addi 26, 25, .LCPI3_5@toc@l + or 25, 9, 8 + li 9, 0 + xxspltw 36, 2, 1 + xxswapd 35, 0 + xxspltw 0, 1, 1 + xxland 35, 0, 35 + mtfprwz 0, 6 + ld 6, 0(3) + addi 3, 3, -8 + vadduwm 4, 3, 4 + xxlor 35, 35, 34 + xxlxor 34, 36, 34 + xxlor 9, 36, 36 + vspltisw 4, 4 + vcmpgtsw 2, 3, 2 + xxspltw 35, 0, 1 + xxlor 10, 36, 36 + vsubuwm 2, 3, 2 + xxlor 11, 34, 34 + lvx 2, 0, 0 + li 0, 32 + xxlor 12, 34, 34 + lvx 2, 0, 28 + li 28, 48 + xxlor 13, 34, 34 + lvx 2, 0, 27 + li 27, 0 + xxlor 31, 34, 34 + lvx 2, 0, 26 + xxlor 30, 34, 34 +.LBB3_2: + mr 26, 27 + addi 27, 27, 1 + xxlor 28, 40, 40 + cmpld 27, 4 + sldi 26, 26, 6 + xxlor 24, 45, 45 + iseleq 24, 10, 9 + add 23, 6, 26 + add 22, 30, 26 + lxvd2x 0, 6, 26 + lxvd2x 1, 7, 26 + or 25, 24, 25 + add 24, 7, 26 + lxvd2x 2, 30, 26 + lxvd2x 3, 29, 26 + xxlor 29, 38, 38 + lxvd2x 4, 23, 11 + lxvd2x 6, 24, 11 + clrlwi 25, 25, 24 + lxvd2x 7, 22, 11 + lxvd2x 8, 23, 0 + mtfprd 5, 25 + add 25, 29, 26 + xxswapd 34, 0 + lxvd2x 0, 25, 11 + xxswapd 36, 1 + xxswapd 33, 2 + lxvd2x 1, 24, 0 + lxvd2x 2, 22, 0 + xxswapd 39, 3 + xxswapd 32, 4 + lxvd2x 3, 25, 0 + lxvd2x 4, 23, 28 + xxswapd 49, 6 + xxswapd 51, 7 + lxvd2x 6, 24, 28 + xxswapd 58, 8 + lxvd2x 7, 22, 28 + lxvd2x 8, 25, 28 + xxswapd 60, 0 + mr 25, 3 + xxswapd 57, 1 + xxswapd 53, 2 + xxswapd 52, 3 + xxswapd 56, 4 + xxswapd 55, 6 + xxswapd 0, 5 + xxswapd 40, 7 + xxswapd 41, 8 + mtctr 12 +.LBB3_3: + ldu 24, 8(25) + add 24, 24, 26 + addi 24, 24, 256 + dcbt 0, 24 + bdnz .LBB3_3 + vmrgew 3, 4, 2 + vspltisw 31, 9 + mr 25, 8 + vmrglw 10, 4, 2 + vspltisw 14, 10 + vmrghw 6, 4, 2 + xxspltw 0, 0, 3 + vmrgew 4, 17, 0 + vmrglw 11, 17, 0 + vmrghw 16, 17, 0 + vmrgew 0, 25, 26 + vmrgew 13, 7, 1 + vmrglw 2, 7, 1 + vmrghw 7, 7, 1 + xxlor 25, 36, 36 + vmrgew 4, 28, 19 + xxlor 26, 32, 32 + vmrglw 0, 25, 26 + vmrglw 1, 28, 19 + xxmrgld 47, 34, 42 + xxlor 44, 28, 28 + vmrghw 25, 25, 26 + xxlor 23, 36, 36 + vmrghw 4, 28, 19 + vspltisw 19, -16 + xxlor 5, 32, 32 + vmrgew 0, 20, 21 + xxmrgld 34, 33, 43 + vmrglw 28, 20, 21 + vmrghw 21, 20, 21 + vmrglw 20, 23, 24 + vmrghw 26, 23, 24 + vmrglw 17, 9, 8 + xxlor 8, 32, 32 + vmrgew 0, 23, 24 + xxmrgld 56, 39, 38 + vmrgew 23, 9, 8 + xxlor 33, 24, 24 + xxlor 2, 34, 34 + vadduwm 11, 15, 1 + xxmrgld 33, 36, 48 + xxlor 6, 47, 47 + xxlor 27, 32, 32 + vmrghw 0, 9, 8 + vspltisw 9, 12 + vsubuwm 8, 31, 19 + xxmrgld 51, 23, 25 + vadduwm 31, 2, 12 + xxlor 34, 10, 10 + vadduwm 10, 14, 14 + vslw 15, 2, 2 + xxlor 34, 29, 29 + vadduwm 14, 24, 27 + xxlor 24, 48, 48 + vadduwm 16, 1, 2 + xxmrgld 34, 45, 35 + vadduwm 31, 31, 30 + xxmrghd 36, 36, 24 + vadduwm 11, 11, 29 + vadduwm 14, 14, 18 + vadduwm 13, 16, 22 + xxlxor 47, 63, 47 + xxlor 1, 9, 9 + xxlor 1, 11, 11 + xxlxor 48, 43, 9 + vadduwm 11, 11, 2 + xxlor 7, 34, 34 + xxmrghd 34, 39, 38 + xxlxor 39, 46, 11 + xxlor 1, 50, 50 + xxlxor 50, 45, 0 + vperm 15, 15, 15, 5 + vperm 16, 16, 16, 5 + vperm 7, 7, 7, 5 + vperm 18, 18, 18, 5 + xxlor 4, 33, 33 + xxlor 33, 31, 31 + vadduwm 14, 14, 2 + xxlor 3, 34, 34 + xxlor 34, 12, 12 + xxlor 35, 13, 13 + vadduwm 6, 15, 1 + xxlor 33, 30, 30 + vadduwm 2, 16, 2 + vadduwm 3, 7, 3 + vadduwm 12, 18, 1 + xxlxor 59, 34, 61 + xxlxor 61, 35, 1 + xxlxor 33, 38, 62 + xxlxor 62, 44, 54 + vrlw 22, 27, 10 + vrlw 29, 29, 10 + vrlw 1, 1, 10 + vrlw 30, 30, 10 + vadduwm 31, 31, 19 + vadduwm 13, 13, 4 + vadduwm 11, 22, 11 + vadduwm 14, 29, 14 + vadduwm 31, 1, 31 + vadduwm 13, 30, 13 + vadduwm 9, 9, 9 + xxlor 1, 36, 36 + xxlxor 48, 43, 48 + xxlxor 36, 46, 39 + xxmrgld 39, 60, 5 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 28, 4, 9 + xxmrgld 36, 53, 57 + vrlw 15, 15, 9 + xxmrghd 57, 53, 57 + vrlw 18, 18, 9 + vadduwm 14, 14, 4 + xxlor 0, 36, 36 + xxmrgld 36, 49, 52 + vadduwm 2, 16, 2 + xxmrgld 49, 8, 26 + vadduwm 3, 28, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 54, 34, 54 + xxlxor 61, 35, 61 + xxlxor 33, 38, 33 + xxlxor 62, 44, 62 + vrlw 29, 29, 8 + vrlw 20, 1, 8 + xxmrgld 33, 55, 27 + vrlw 30, 30, 8 + vrlw 22, 22, 8 + vadduwm 11, 11, 7 + xxlor 5, 39, 39 + xxmrgld 39, 32, 58 + vadduwm 31, 31, 4 + vadduwm 11, 29, 11 + vadduwm 13, 13, 7 + vadduwm 14, 20, 14 + vadduwm 31, 30, 31 + vadduwm 13, 22, 13 + xxlor 28, 36, 36 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 60 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vadduwm 11, 11, 17 + vmr 28, 17 + xxmrghd 49, 32, 58 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 21, 4, 2 + vadduwm 3, 15, 3 + xxlxor 34, 38, 61 + xxlxor 61, 44, 52 + xxlxor 62, 53, 62 + xxlxor 54, 35, 54 + vrlw 20, 2, 10 + vrlw 29, 29, 10 + vrlw 0, 30, 10 + vrlw 30, 22, 10 + vadduwm 14, 14, 25 + vadduwm 31, 31, 1 + vadduwm 13, 13, 17 + vadduwm 11, 20, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 24 + xxlor 8, 56, 56 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 21 + vadduwm 3, 15, 3 + xxlxor 55, 38, 52 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 25, 51, 51 + vmr 26, 17 + xxlor 49, 3, 3 + xxlor 52, 1, 1 + xxlor 51, 2, 2 + vadduwm 14, 14, 17 + vadduwm 31, 31, 20 + vadduwm 13, 13, 19 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 29, 39, 39 + xxlor 59, 4, 4 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 53, 0, 0 + xxlor 39, 6, 6 + vadduwm 11, 11, 27 + vadduwm 14, 14, 21 + vadduwm 31, 31, 7 + vadduwm 13, 13, 1 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 34, 7, 7 + vadduwm 31, 31, 28 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 11, 11, 2 + xxlor 34, 28, 28 + vadduwm 13, 13, 26 + vadduwm 14, 14, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 2, 58, 58 + xxlor 39, 25, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 54, 29, 29 + xxlor 58, 5, 5 + vadduwm 11, 11, 25 + vadduwm 14, 14, 7 + vadduwm 31, 31, 22 + vadduwm 13, 13, 26 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 17 + vadduwm 14, 14, 21 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 31, 31, 1 + vadduwm 13, 13, 20 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 0, 33, 33 + xxlor 33, 8, 8 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vadduwm 11, 11, 19 + vadduwm 14, 14, 2 + vadduwm 31, 31, 1 + vadduwm 13, 13, 22 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + vadduwm 11, 11, 27 + vadduwm 14, 14, 28 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 31, 31, 25 + vadduwm 13, 13, 26 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 3, 7, 7 + vadduwm 11, 11, 7 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 33, 6, 6 + xxlor 58, 2, 2 + xxlor 39, 3, 3 + vadduwm 14, 14, 1 + vadduwm 31, 31, 26 + vadduwm 13, 13, 7 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + xxlor 52, 0, 0 + vadduwm 11, 11, 21 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 14, 14, 2 + vadduwm 31, 31, 22 + vadduwm 13, 13, 20 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 7, 49, 49 + vmr 17, 2 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 54, 1, 1 + xxlor 34, 7, 7 + vadduwm 11, 11, 22 + vadduwm 14, 14, 28 + vadduwm 31, 31, 2 + vadduwm 13, 13, 26 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 59, 25, 25 + vadduwm 11, 11, 19 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 14, 14, 25 + vadduwm 31, 31, 27 + vadduwm 13, 13, 7 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vmr 2, 19 + xxlor 0, 7, 7 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 1, 51, 51 + xxlor 7, 39, 39 + xxlor 51, 8, 8 + xxlor 39, 5, 5 + xxlor 34, 4, 4 + vadduwm 11, 11, 1 + vadduwm 14, 14, 19 + vadduwm 31, 31, 7 + vadduwm 13, 13, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + xxlor 2, 53, 53 + vmr 21, 28 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 53, 29, 29 + vadduwm 11, 11, 17 + vadduwm 14, 14, 28 + vadduwm 31, 31, 26 + vadduwm 13, 13, 21 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + vadduwm 11, 11, 20 + xxlor 5, 52, 52 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 52, 2, 2 + vadduwm 14, 14, 25 + vadduwm 31, 31, 20 + vadduwm 13, 13, 7 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + vadduwm 11, 11, 22 + vadduwm 14, 14, 27 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 31, 31, 1 + vadduwm 13, 13, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 3, 29, 29 + xxlor 4, 49, 49 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + vmr 17, 28 + xxlor 2, 54, 54 + xxlor 3, 34, 34 + xxlor 34, 8, 8 + xxlor 51, 0, 0 + xxlor 60, 7, 7 + xxlor 54, 1, 1 + vadduwm 11, 11, 2 + vadduwm 14, 14, 19 + vadduwm 31, 31, 28 + vadduwm 13, 13, 22 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 17 + vadduwm 14, 14, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 31, 31, 7 + vadduwm 13, 13, 26 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 6, 39, 39 + xxlor 39, 4, 4 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vadduwm 11, 11, 21 + vadduwm 14, 14, 27 + vadduwm 31, 31, 7 + vadduwm 13, 13, 28 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 0, 49, 49 + xxlor 49, 5, 5 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 11, 11, 17 + vadduwm 14, 14, 1 + vadduwm 31, 31, 2 + vadduwm 13, 13, 22 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 34, 3, 3 + xxlor 49, 2, 2 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + vadduwm 11, 11, 19 + vadduwm 14, 14, 20 + vadduwm 31, 31, 2 + vadduwm 13, 13, 17 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 14, 14, 27 + vadduwm 11, 11, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 27, 4, 24 + vadduwm 3, 15, 3 + xxlxor 57, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 59, 32 + xxlor 39, 7, 7 + vrlw 30, 30, 8 + vrlw 25, 25, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 1, 58, 58 + vmr 26, 19 + vadduwm 19, 31, 7 + xxlor 39, 6, 6 + vadduwm 11, 30, 11 + vadduwm 7, 13, 7 + vadduwm 13, 25, 14 + vadduwm 14, 29, 19 + vadduwm 7, 0, 7 + xxlxor 48, 43, 48 + xxlxor 36, 45, 36 + xxlxor 47, 46, 47 + xxlxor 50, 39, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 51, 1, 1 + vadduwm 13, 13, 1 + vadduwm 11, 11, 19 + vadduwm 19, 16, 27 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 63, 51, 62 + xxlxor 62, 35, 57 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 31, 31, 10 + vrlw 30, 30, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 33, 0, 0 + vadduwm 7, 7, 2 + vadduwm 14, 14, 1 + vadduwm 11, 31, 11 + vadduwm 13, 30, 13 + vadduwm 14, 29, 14 + vadduwm 7, 0, 7 + xxlxor 48, 43, 48 + xxlxor 36, 45, 36 + xxlxor 47, 46, 47 + xxlxor 50, 39, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 60, 8, 8 + vadduwm 1, 11, 21 + vadduwm 11, 13, 28 + vadduwm 13, 16, 19 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 51, 45, 63 + xxlxor 63, 35, 62 + xxlxor 62, 38, 61 + xxlxor 32, 44, 32 + vrlw 31, 31, 8 + vrlw 30, 30, 8 + vrlw 0, 0, 8 + vrlw 19, 19, 8 + vadduwm 14, 14, 26 + vadduwm 7, 7, 17 + vadduwm 1, 31, 1 + vadduwm 11, 30, 11 + vadduwm 14, 0, 14 + vadduwm 7, 19, 7 + xxlxor 50, 33, 50 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 39, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 34, 4, 4 + vadduwm 14, 14, 22 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 13, 4, 13 + vadduwm 3, 15, 3 + xxlxor 49, 38, 63 + xxlxor 63, 44, 62 + xxlxor 32, 45, 32 + xxlxor 51, 35, 51 + vrlw 17, 17, 10 + vrlw 31, 31, 10 + vrlw 0, 0, 10 + vrlw 10, 19, 10 + vadduwm 11, 11, 2 + xxlor 34, 5, 5 + vadduwm 1, 1, 20 + vadduwm 2, 7, 2 + vadduwm 7, 31, 11 + vadduwm 11, 0, 14 + vadduwm 2, 10, 2 + vadduwm 1, 17, 1 + xxlxor 36, 43, 36 + xxlxor 46, 34, 47 + vrlw 4, 4, 9 + vrlw 14, 14, 9 + xxlxor 47, 33, 50 + xxlxor 48, 39, 48 + vrlw 15, 15, 9 + vrlw 9, 16, 9 + vadduwm 13, 4, 13 + vadduwm 3, 14, 3 + xxlxor 32, 45, 32 + xxlxor 45, 45, 33 + xxlxor 33, 35, 42 + xxlxor 59, 35, 39 + vadduwm 3, 15, 6 + vadduwm 6, 9, 12 + xxlxor 39, 35, 49 + xxlxor 42, 38, 63 + vrlw 1, 1, 8 + vrlw 7, 7, 8 + vrlw 10, 10, 8 + vrlw 0, 0, 8 + xxlxor 40, 35, 43 + xxlxor 38, 38, 34 + xxlxor 61, 33, 41 + xxlxor 50, 39, 36 + xxlxor 62, 42, 46 + xxlxor 54, 32, 47 + bne 0, .LBB3_2 +.LBB3_5: + vmrglw 2, 27, 13 + li 3, 32 + li 4, 48 + vmrglw 4, 6, 8 + vmrglw 0, 18, 29 + vmrglw 1, 22, 30 + vmrghw 3, 27, 13 + vmrghw 5, 6, 8 + vmrghw 6, 18, 29 + vmrghw 7, 22, 30 + xxmrgld 40, 36, 34 + xxmrghd 34, 36, 34 + xxmrgld 41, 33, 32 + xxswapd 0, 40 + xxmrgld 36, 37, 35 + xxmrghd 35, 37, 35 + xxmrghd 37, 33, 32 + xxswapd 1, 41 + xxmrgld 32, 39, 38 + xxmrghd 33, 39, 38 + xxswapd 2, 34 + xxswapd 4, 36 + xxswapd 3, 37 + stxvd2x 0, 0, 5 + xxswapd 5, 32 + stxvd2x 1, 5, 11 + xxswapd 0, 35 + xxswapd 1, 33 + stxvd2x 2, 5, 3 + li 3, 64 + stxvd2x 3, 5, 4 + li 4, 80 + stxvd2x 4, 5, 3 + li 3, 96 + stxvd2x 5, 5, 4 + li 4, 112 + stxvd2x 0, 5, 3 + stxvd2x 1, 5, 4 + li 3, 224 + lxvd2x 63, 1, 3 + li 3, 208 + lfd 31, 392(1) + ld 30, 312(1) + ld 29, 304(1) + lxvd2x 62, 1, 3 + li 3, 192 + lfd 30, 384(1) + ld 28, 296(1) + ld 27, 288(1) + lxvd2x 61, 1, 3 + li 3, 176 + lfd 29, 376(1) + ld 26, 280(1) + ld 25, 272(1) + lxvd2x 60, 1, 3 + li 3, 160 + lfd 28, 368(1) + ld 24, 264(1) + ld 23, 256(1) + lxvd2x 59, 1, 3 + li 3, 144 + lfd 27, 360(1) + ld 22, 248(1) + lxvd2x 58, 1, 3 + li 3, 128 + lfd 26, 352(1) + lxvd2x 57, 1, 3 + li 3, 112 + lfd 25, 344(1) + lxvd2x 56, 1, 3 + li 3, 96 + lfd 24, 336(1) + lxvd2x 55, 1, 3 + li 3, 80 + lfd 23, 328(1) + lxvd2x 54, 1, 3 + li 3, 64 + lxvd2x 53, 1, 3 + li 3, 48 + lxvd2x 52, 1, 3 + addi 1, 1, 400 + blr + .long 0 + .quad 0 +.Lfunc_end3: + .size blake3_hash4_sse2, .Lfunc_end3-.Lfunc_begin3 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S new file mode 100644 index 000000000..a8b2627f1 --- /dev/null +++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S @@ -0,0 +1,3064 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE4.1 -> POWER8 PPC64 Little Endian + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + .text + .abiversion 2 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 31 + .byte 14 + .byte 13 + .byte 12 + .byte 30 + .byte 10 + .byte 9 + .byte 8 + .byte 29 + .byte 6 + .byte 5 + .byte 4 + .byte 28 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI0_2: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI0_3: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI0_4: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_5: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI0_6: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 +.LCPI0_7: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_8: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_9: + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_10: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 31 + .byte 31 + .byte 31 +.LCPI0_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_12: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_13: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_14: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .text + .globl zfs_blake3_compress_in_place_sse41 + .p2align 2 + .type zfs_blake3_compress_in_place_sse41,@function +zfs_blake3_compress_in_place_sse41: +.Lfunc_begin0: + .cfi_startproc +.Lfunc_gep0: + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 2, 2, .TOC.-.Lfunc_gep0@l +.Lfunc_lep0: + .localentry zfs_blake3_compress_in_place_sse41, .Lfunc_lep0-.Lfunc_gep0 + li 8, -64 + mtvsrd 34, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 13, -16 + stxvd2x 60, 1, 8 + li 8, -48 + mtvsrd 35, 7 + lfd 2, 16(4) + lfd 3, 24(4) + addis 7, 2, .LCPI0_0@toc@ha + stxvd2x 61, 1, 8 + li 8, -32 + mtvsrwz 36, 6 + rldicl 6, 6, 32, 32 + stxvd2x 62, 1, 8 + li 8, -16 + vmrghb 2, 3, 2 + stxvd2x 63, 1, 8 + mtvsrwz 35, 6 + addi 6, 7, .LCPI0_0@toc@l + addis 7, 2, .LCPI0_2@toc@ha + lfd 1, 8(4) + xxmrghd 32, 3, 2 + lvx 6, 0, 6 + xxlxor 33, 33, 33 + addis 6, 2, .LCPI0_1@toc@ha + addi 7, 7, .LCPI0_2@toc@l + vmrghw 3, 3, 4 + addi 6, 6, .LCPI0_1@toc@l + vspltisw 14, 9 + xxmrghd 37, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + vperm 2, 1, 2, 6 + vpkudum 9, 0, 5 + xxswapd 36, 0 + xxswapd 38, 1 + xxmrgld 34, 34, 35 + lvx 3, 0, 7 + addis 7, 2, .LCPI0_4@toc@ha + addi 7, 7, .LCPI0_4@toc@l + vadduwm 4, 9, 4 + lvx 11, 0, 7 + addis 7, 2, .LCPI0_6@toc@ha + addi 7, 7, .LCPI0_6@toc@l + vadduwm 7, 4, 6 + lvx 4, 0, 6 + addis 6, 2, .LCPI0_3@toc@ha + addi 6, 6, .LCPI0_3@toc@l + vperm 11, 0, 5, 11 + lvx 0, 0, 7 + li 7, 48 + xxlxor 40, 39, 34 + lvx 10, 0, 6 + addis 6, 2, .LCPI0_5@toc@ha + lxvd2x 1, 4, 7 + vcmpgtsb 2, 1, 4 + addi 6, 6, .LCPI0_5@toc@l + vperm 4, 8, 8, 3 + vspltisw 8, 10 + xxlandc 44, 36, 34 + vadduwm 4, 8, 8 + vadduwm 8, 12, 10 + xxlxor 37, 40, 38 + vrlw 6, 5, 4 + vadduwm 5, 7, 11 + vadduwm 7, 6, 5 + lvx 5, 0, 6 + li 6, 32 + lxvd2x 0, 4, 6 + addis 4, 2, .LCPI0_7@toc@ha + addis 6, 2, .LCPI0_9@toc@ha + xxlxor 42, 39, 44 + xxswapd 44, 1 + addi 4, 4, .LCPI0_7@toc@l + addi 6, 6, .LCPI0_9@toc@l + vcmpgtsb 5, 1, 5 + vperm 1, 10, 10, 0 + xxswapd 42, 0 + vpkudum 16, 12, 10 + xxlandc 47, 33, 37 + vsubuwm 1, 14, 13 + lvx 14, 0, 4 + addis 4, 2, .LCPI0_8@toc@ha + vadduwm 8, 15, 8 + xxswapd 45, 47 + addi 4, 4, .LCPI0_8@toc@l + vadduwm 7, 7, 16 + xxsldwi 48, 48, 48, 1 + xxlxor 38, 40, 38 + xxsldwi 40, 40, 40, 3 + xxsldwi 39, 39, 39, 1 + vperm 14, 10, 12, 14 + vrlw 6, 6, 1 + vadduwm 7, 6, 7 + xxlxor 45, 39, 45 + vperm 13, 13, 13, 3 + xxlandc 45, 45, 34 + vadduwm 8, 13, 8 + xxlxor 38, 40, 38 + vrlw 10, 6, 4 + vadduwm 6, 7, 14 + vadduwm 7, 10, 6 + xxlxor 38, 39, 45 + vperm 12, 6, 6, 0 + lvx 6, 0, 4 + addis 4, 2, .LCPI0_10@toc@ha + addi 4, 4, .LCPI0_10@toc@l + vperm 13, 11, 9, 6 + xxlandc 44, 44, 37 + vadduwm 15, 12, 8 + vadduwm 7, 7, 13 + xxsldwi 45, 45, 45, 3 + xxlxor 40, 47, 42 + xxsldwi 47, 47, 47, 1 + xxsldwi 39, 39, 39, 3 + vrlw 10, 8, 1 + xxswapd 40, 44 + vadduwm 17, 10, 7 + lvx 7, 0, 4 + addis 4, 2, .LCPI0_11@toc@ha + addi 4, 4, .LCPI0_11@toc@l + xxlxor 44, 49, 40 + lvx 8, 0, 6 + vperm 18, 9, 9, 7 + lvx 9, 0, 4 + addis 4, 2, .LCPI0_12@toc@ha + vperm 12, 12, 12, 3 + addi 4, 4, .LCPI0_12@toc@l + vperm 19, 14, 16, 8 + xxlandc 63, 44, 34 + vperm 12, 19, 18, 9 + vadduwm 15, 31, 15 + xxlxor 42, 47, 42 + vrlw 18, 10, 4 + vadduwm 10, 17, 12 + vadduwm 17, 18, 10 + xxlxor 42, 49, 63 + xxmrgld 63, 43, 46 + xxsldwi 49, 49, 49, 1 + vmrghw 14, 14, 11 + vperm 19, 10, 10, 0 + lvx 10, 0, 4 + addis 4, 2, .LCPI0_13@toc@ha + addi 4, 4, .LCPI0_13@toc@l + lvx 11, 0, 4 + addis 4, 2, .LCPI0_14@toc@ha + vperm 31, 16, 31, 10 + addi 4, 4, .LCPI0_14@toc@l + vperm 14, 14, 16, 11 + xxlandc 51, 51, 37 + vadduwm 15, 19, 15 + xxswapd 51, 51 + vadduwm 17, 17, 31 + xxlxor 50, 47, 50 + xxsldwi 47, 47, 47, 3 + vperm 30, 14, 31, 8 + vrlw 18, 18, 1 + vadduwm 17, 18, 17 + xxlxor 51, 49, 51 + vadduwm 17, 17, 14 + vperm 19, 19, 19, 3 + xxlandc 51, 51, 34 + vadduwm 15, 19, 15 + xxlxor 48, 47, 50 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 51 + vperm 19, 12, 13, 6 + vperm 18, 18, 18, 0 + vperm 13, 13, 13, 7 + vadduwm 17, 17, 19 + xxlandc 50, 50, 37 + xxsldwi 49, 49, 49, 3 + vperm 13, 30, 13, 9 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxmrgld 62, 44, 46 + vmrghw 12, 14, 12 + xxlxor 48, 47, 48 + xxsldwi 47, 47, 47, 1 + vrlw 16, 16, 1 + vperm 30, 31, 30, 10 + vperm 12, 12, 31, 11 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 13 + vperm 18, 18, 18, 3 + vperm 31, 12, 30, 8 + xxlandc 50, 50, 34 + vadduwm 15, 18, 15 + xxlxor 48, 47, 48 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + xxsldwi 49, 49, 49, 1 + vperm 18, 18, 18, 0 + vadduwm 17, 17, 30 + xxlandc 50, 50, 37 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxlxor 48, 47, 48 + xxsldwi 46, 47, 47, 3 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 12 + vperm 18, 18, 18, 3 + xxlandc 47, 50, 34 + xxsldwi 50, 51, 51, 3 + vadduwm 14, 15, 14 + vperm 19, 13, 18, 6 + xxlxor 48, 46, 48 + vperm 18, 18, 18, 7 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vadduwm 17, 17, 19 + vperm 15, 15, 15, 0 + xxsldwi 49, 49, 49, 3 + xxlandc 47, 47, 37 + vadduwm 14, 15, 14 + xxswapd 47, 47 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 29, 15, 14 + vperm 14, 31, 18, 9 + xxmrgld 50, 45, 44 + xxlxor 48, 61, 48 + vmrghw 12, 12, 13 + vrlw 16, 16, 4 + vperm 18, 30, 18, 10 + vadduwm 17, 17, 14 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + xxsldwi 49, 49, 49, 1 + vperm 15, 15, 15, 0 + vadduwm 17, 17, 18 + xxlandc 47, 47, 37 + vadduwm 31, 15, 29 + xxswapd 47, 47 + xxlxor 48, 63, 48 + xxsldwi 45, 63, 63, 3 + vperm 31, 12, 30, 11 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 13, 15, 13 + xxlxor 44, 45, 48 + vadduwm 16, 17, 31 + xxsldwi 49, 51, 51, 3 + vrlw 12, 12, 4 + vperm 19, 14, 17, 6 + vadduwm 16, 12, 16 + xxlxor 47, 48, 47 + vperm 15, 15, 15, 0 + xxlandc 47, 47, 37 + vadduwm 13, 15, 13 + xxswapd 47, 47 + xxlxor 44, 45, 44 + xxsldwi 45, 45, 45, 1 + vrlw 30, 12, 1 + vadduwm 12, 16, 19 + xxsldwi 44, 44, 44, 3 + vadduwm 16, 30, 12 + xxlxor 44, 48, 47 + vperm 15, 17, 17, 7 + vperm 12, 12, 12, 3 + vperm 17, 31, 18, 8 + xxlandc 61, 44, 34 + vperm 12, 17, 15, 9 + vadduwm 13, 29, 13 + xxlxor 47, 45, 62 + xxmrgld 62, 46, 63 + vmrghw 14, 31, 14 + vrlw 15, 15, 4 + vadduwm 16, 16, 12 + vperm 30, 18, 30, 10 + vperm 14, 14, 18, 11 + xxsldwi 50, 51, 51, 3 + vadduwm 16, 15, 16 + xxlxor 49, 48, 61 + xxsldwi 48, 48, 48, 1 + vperm 19, 12, 18, 6 + vperm 17, 17, 17, 0 + vadduwm 16, 16, 30 + xxmrgld 60, 44, 46 + vmrghw 12, 14, 12 + vperm 28, 30, 28, 10 + xxlandc 49, 49, 37 + vadduwm 13, 17, 13 + xxswapd 49, 49 + vperm 12, 12, 30, 11 + xxlxor 47, 45, 47 + xxsldwi 45, 45, 45, 3 + vrlw 15, 15, 1 + vperm 8, 12, 28, 8 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vadduwm 16, 16, 14 + vperm 17, 17, 17, 3 + xxlandc 49, 49, 34 + vadduwm 13, 17, 13 + xxlxor 47, 45, 47 + vrlw 15, 15, 4 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vperm 17, 17, 17, 0 + xxlandc 49, 49, 37 + vadduwm 31, 17, 13 + xxlxor 45, 63, 47 + vrlw 15, 13, 1 + vadduwm 13, 16, 19 + xxswapd 48, 49 + xxsldwi 51, 51, 51, 3 + xxsldwi 45, 45, 45, 3 + vadduwm 17, 15, 13 + xxlxor 45, 49, 48 + lvx 16, 0, 4 + vperm 29, 13, 13, 3 + vperm 13, 18, 18, 7 + xxsldwi 50, 63, 63, 1 + vperm 16, 14, 30, 16 + vperm 7, 19, 19, 7 + xxlandc 63, 61, 34 + vadduwm 18, 31, 18 + vperm 29, 16, 13, 9 + xxlxor 47, 50, 47 + vperm 6, 16, 19, 6 + vrlw 15, 15, 4 + vperm 7, 8, 7, 9 + vadduwm 17, 17, 29 + xxmrgld 41, 61, 44 + vadduwm 17, 15, 17 + vperm 9, 28, 9, 10 + xxlxor 63, 49, 63 + xxsldwi 49, 49, 49, 1 + vperm 31, 31, 31, 0 + vadduwm 17, 17, 28 + xxlandc 63, 63, 37 + vadduwm 18, 31, 18 + xxswapd 63, 63 + xxlxor 47, 50, 47 + xxsldwi 46, 50, 50, 3 + vrlw 15, 15, 1 + vadduwm 17, 15, 17 + xxlxor 63, 49, 63 + vadduwm 17, 17, 12 + vperm 31, 31, 31, 3 + xxlandc 50, 63, 34 + vadduwm 14, 18, 14 + xxlxor 47, 46, 47 + vrlw 15, 15, 4 + vadduwm 17, 15, 17 + xxlxor 50, 49, 50 + vadduwm 6, 17, 6 + vperm 18, 18, 18, 0 + xxsldwi 38, 38, 38, 3 + xxlandc 50, 50, 37 + vadduwm 14, 18, 14 + xxswapd 48, 50 + xxlxor 47, 46, 47 + xxsldwi 46, 46, 46, 1 + vrlw 15, 15, 1 + vadduwm 6, 15, 6 + xxlxor 48, 38, 48 + vadduwm 6, 6, 7 + vperm 16, 16, 16, 3 + xxlandc 48, 48, 34 + vadduwm 14, 16, 14 + xxlxor 40, 46, 47 + vrlw 8, 8, 4 + vadduwm 6, 8, 6 + xxlxor 39, 38, 48 + xxsldwi 38, 38, 38, 1 + vperm 7, 7, 7, 0 + vadduwm 6, 6, 9 + xxlandc 39, 39, 37 + vadduwm 14, 7, 14 + xxswapd 39, 39 + xxlxor 40, 46, 40 + xxsldwi 41, 46, 46, 3 + vrlw 8, 8, 1 + vadduwm 6, 8, 6 + xxlxor 39, 38, 39 + vperm 3, 7, 7, 3 + vmrghw 7, 12, 13 + xxlandc 34, 35, 34 + vperm 7, 7, 28, 11 + vadduwm 3, 2, 9 + xxlxor 40, 35, 40 + vrlw 4, 8, 4 + vadduwm 6, 6, 7 + vadduwm 6, 4, 6 + xxlxor 34, 38, 34 + xxsldwi 0, 38, 38, 3 + vperm 2, 2, 2, 0 + xxlandc 34, 34, 37 + vadduwm 3, 2, 3 + xxswapd 34, 34 + xxlxor 36, 35, 36 + xxsldwi 1, 35, 35, 1 + vrlw 4, 4, 1 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 1, 36, 34 + stxvd2x 0, 0, 3 + xxswapd 1, 1 + stxvd2x 1, 3, 5 + li 3, -16 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 31 + .byte 14 + .byte 13 + .byte 12 + .byte 30 + .byte 10 + .byte 9 + .byte 8 + .byte 29 + .byte 6 + .byte 5 + .byte 4 + .byte 28 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI1_2: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI1_3: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI1_4: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_5: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI1_6: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 +.LCPI1_7: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_8: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_9: + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_10: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 31 + .byte 31 + .byte 31 +.LCPI1_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_12: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_13: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_14: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .text + .globl zfs_blake3_compress_xof_sse41 + .p2align 2 + .type zfs_blake3_compress_xof_sse41,@function +zfs_blake3_compress_xof_sse41: +.Lfunc_begin1: + .cfi_startproc +.Lfunc_gep1: + addis 2, 12, .TOC.-.Lfunc_gep1@ha + addi 2, 2, .TOC.-.Lfunc_gep1@l +.Lfunc_lep1: + .localentry zfs_blake3_compress_xof_sse41, .Lfunc_lep1-.Lfunc_gep1 + li 9, -64 + mtvsrd 34, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 13, -16 + addis 11, 2, .LCPI1_9@toc@ha + stxvd2x 60, 1, 9 + li 9, -48 + mtvsrd 35, 7 + lfd 1, 8(4) + lfd 2, 16(4) + addis 7, 2, .LCPI1_0@toc@ha + stxvd2x 61, 1, 9 + li 9, -32 + mtvsrwz 36, 6 + rldicl 6, 6, 32, 32 + stxvd2x 62, 1, 9 + li 9, -16 + vmrghb 2, 3, 2 + stxvd2x 63, 1, 9 + mtvsrwz 35, 6 + addi 6, 7, .LCPI1_0@toc@l + addis 7, 2, .LCPI1_2@toc@ha + lfd 3, 24(4) + xxmrghd 37, 1, 0 + lvx 6, 0, 6 + xxlxor 33, 33, 33 + lxvd2x 0, 0, 3 + addis 6, 2, .LCPI1_1@toc@ha + addi 7, 7, .LCPI1_2@toc@l + vmrghw 3, 3, 4 + lxvd2x 1, 3, 5 + addi 6, 6, .LCPI1_1@toc@l + vspltisw 14, 9 + xxmrghd 32, 3, 2 + xxswapd 36, 0 + vperm 2, 1, 2, 6 + xxswapd 38, 1 + vpkudum 9, 0, 5 + xxmrgld 34, 34, 35 + lvx 3, 0, 7 + addis 7, 2, .LCPI1_4@toc@ha + addi 7, 7, .LCPI1_4@toc@l + vadduwm 4, 9, 4 + lvx 11, 0, 7 + addis 7, 2, .LCPI1_6@toc@ha + addi 7, 7, .LCPI1_6@toc@l + vadduwm 7, 4, 6 + lvx 4, 0, 6 + addis 6, 2, .LCPI1_3@toc@ha + addi 6, 6, .LCPI1_3@toc@l + vperm 11, 0, 5, 11 + lvx 0, 0, 7 + li 7, 32 + xxlxor 40, 39, 34 + lvx 10, 0, 6 + addis 6, 2, .LCPI1_5@toc@ha + lxvd2x 0, 4, 7 + vcmpgtsb 2, 1, 4 + addi 6, 6, .LCPI1_5@toc@l + vperm 4, 8, 8, 3 + vspltisw 8, 10 + xxlandc 44, 36, 34 + vadduwm 4, 8, 8 + vadduwm 8, 12, 10 + xxlxor 37, 40, 38 + vrlw 6, 5, 4 + vadduwm 5, 7, 11 + vadduwm 7, 6, 5 + lvx 5, 0, 6 + li 6, 48 + lxvd2x 1, 4, 6 + addis 4, 2, .LCPI1_7@toc@ha + xxlxor 42, 39, 44 + addi 4, 4, .LCPI1_7@toc@l + vcmpgtsb 5, 1, 5 + vperm 1, 10, 10, 0 + xxswapd 42, 0 + xxswapd 44, 1 + vpkudum 16, 12, 10 + xxlandc 47, 33, 37 + vsubuwm 1, 14, 13 + lvx 14, 0, 4 + addis 4, 2, .LCPI1_8@toc@ha + vadduwm 8, 15, 8 + xxswapd 45, 47 + addi 4, 4, .LCPI1_8@toc@l + xxlxor 38, 40, 38 + xxsldwi 40, 40, 40, 3 + vadduwm 7, 7, 16 + xxsldwi 48, 48, 48, 1 + vrlw 6, 6, 1 + xxsldwi 39, 39, 39, 1 + vperm 14, 10, 12, 14 + vadduwm 7, 6, 7 + xxlxor 45, 39, 45 + vperm 13, 13, 13, 3 + xxlandc 45, 45, 34 + vadduwm 8, 13, 8 + xxlxor 38, 40, 38 + vrlw 10, 6, 4 + vadduwm 6, 7, 14 + vadduwm 7, 10, 6 + xxlxor 38, 39, 45 + vperm 12, 6, 6, 0 + lvx 6, 0, 4 + addis 4, 2, .LCPI1_10@toc@ha + addi 4, 4, .LCPI1_10@toc@l + vperm 13, 11, 9, 6 + xxlandc 44, 44, 37 + vadduwm 15, 12, 8 + vadduwm 7, 7, 13 + xxsldwi 45, 45, 45, 3 + xxlxor 40, 47, 42 + xxsldwi 47, 47, 47, 1 + xxsldwi 39, 39, 39, 3 + vrlw 10, 8, 1 + xxswapd 40, 44 + vadduwm 17, 10, 7 + lvx 7, 0, 4 + addi 4, 11, .LCPI1_9@toc@l + xxlxor 44, 49, 40 + lvx 8, 0, 4 + addis 4, 2, .LCPI1_11@toc@ha + vperm 18, 9, 9, 7 + addi 4, 4, .LCPI1_11@toc@l + vperm 12, 12, 12, 3 + lvx 9, 0, 4 + addis 4, 2, .LCPI1_12@toc@ha + vperm 19, 14, 16, 8 + addi 4, 4, .LCPI1_12@toc@l + xxlandc 63, 44, 34 + vperm 12, 19, 18, 9 + vadduwm 15, 31, 15 + xxlxor 42, 47, 42 + vrlw 18, 10, 4 + vadduwm 10, 17, 12 + vadduwm 17, 18, 10 + xxlxor 42, 49, 63 + xxmrgld 63, 43, 46 + xxsldwi 49, 49, 49, 1 + vmrghw 14, 14, 11 + vperm 19, 10, 10, 0 + lvx 10, 0, 4 + addis 4, 2, .LCPI1_13@toc@ha + addi 4, 4, .LCPI1_13@toc@l + lvx 11, 0, 4 + addis 4, 2, .LCPI1_14@toc@ha + vperm 31, 16, 31, 10 + addi 4, 4, .LCPI1_14@toc@l + vperm 14, 14, 16, 11 + xxlandc 51, 51, 37 + vadduwm 15, 19, 15 + xxswapd 51, 51 + vadduwm 17, 17, 31 + xxlxor 50, 47, 50 + xxsldwi 47, 47, 47, 3 + vperm 30, 14, 31, 8 + vrlw 18, 18, 1 + vadduwm 17, 18, 17 + xxlxor 51, 49, 51 + vadduwm 17, 17, 14 + vperm 19, 19, 19, 3 + xxlandc 51, 51, 34 + vadduwm 15, 19, 15 + xxlxor 48, 47, 50 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 51 + vperm 19, 12, 13, 6 + vperm 18, 18, 18, 0 + vperm 13, 13, 13, 7 + vadduwm 17, 17, 19 + xxlandc 50, 50, 37 + xxsldwi 49, 49, 49, 3 + vperm 13, 30, 13, 9 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxmrgld 62, 44, 46 + vmrghw 12, 14, 12 + xxlxor 48, 47, 48 + xxsldwi 47, 47, 47, 1 + vrlw 16, 16, 1 + vperm 30, 31, 30, 10 + vperm 12, 12, 31, 11 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 13 + vperm 18, 18, 18, 3 + vperm 31, 12, 30, 8 + xxlandc 50, 50, 34 + vadduwm 15, 18, 15 + xxlxor 48, 47, 48 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + xxsldwi 49, 49, 49, 1 + vperm 18, 18, 18, 0 + vadduwm 17, 17, 30 + xxlandc 50, 50, 37 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxlxor 48, 47, 48 + xxsldwi 46, 47, 47, 3 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 12 + vperm 18, 18, 18, 3 + xxlandc 47, 50, 34 + xxsldwi 50, 51, 51, 3 + vadduwm 14, 15, 14 + vperm 19, 13, 18, 6 + xxlxor 48, 46, 48 + vperm 18, 18, 18, 7 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vadduwm 17, 17, 19 + vperm 15, 15, 15, 0 + xxsldwi 49, 49, 49, 3 + xxlandc 47, 47, 37 + vadduwm 14, 15, 14 + xxswapd 47, 47 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 29, 15, 14 + vperm 14, 31, 18, 9 + xxmrgld 50, 45, 44 + xxlxor 48, 61, 48 + vmrghw 12, 12, 13 + vrlw 16, 16, 4 + vperm 18, 30, 18, 10 + vadduwm 17, 17, 14 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + xxsldwi 49, 49, 49, 1 + vperm 15, 15, 15, 0 + vadduwm 17, 17, 18 + xxlandc 47, 47, 37 + vadduwm 31, 15, 29 + xxswapd 47, 47 + xxlxor 48, 63, 48 + xxsldwi 45, 63, 63, 3 + vperm 31, 12, 30, 11 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 13, 15, 13 + xxlxor 44, 45, 48 + vadduwm 16, 17, 31 + xxsldwi 49, 51, 51, 3 + vrlw 12, 12, 4 + vperm 19, 14, 17, 6 + vadduwm 16, 12, 16 + xxlxor 47, 48, 47 + vperm 15, 15, 15, 0 + xxlandc 47, 47, 37 + vadduwm 13, 15, 13 + xxswapd 47, 47 + xxlxor 44, 45, 44 + xxsldwi 45, 45, 45, 1 + vrlw 30, 12, 1 + vadduwm 12, 16, 19 + xxsldwi 44, 44, 44, 3 + vadduwm 16, 30, 12 + xxlxor 44, 48, 47 + vperm 15, 17, 17, 7 + vperm 12, 12, 12, 3 + vperm 17, 31, 18, 8 + xxlandc 61, 44, 34 + vperm 12, 17, 15, 9 + vadduwm 13, 29, 13 + xxlxor 47, 45, 62 + xxmrgld 62, 46, 63 + vmrghw 14, 31, 14 + vrlw 15, 15, 4 + vadduwm 16, 16, 12 + vperm 30, 18, 30, 10 + vperm 14, 14, 18, 11 + xxsldwi 50, 51, 51, 3 + vadduwm 16, 15, 16 + xxlxor 49, 48, 61 + xxsldwi 48, 48, 48, 1 + vperm 19, 12, 18, 6 + vperm 17, 17, 17, 0 + vadduwm 16, 16, 30 + xxmrgld 60, 44, 46 + vmrghw 12, 14, 12 + vperm 28, 30, 28, 10 + xxlandc 49, 49, 37 + vadduwm 13, 17, 13 + xxswapd 49, 49 + vperm 12, 12, 30, 11 + xxlxor 47, 45, 47 + xxsldwi 45, 45, 45, 3 + vrlw 15, 15, 1 + vperm 8, 12, 28, 8 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vadduwm 16, 16, 14 + vperm 17, 17, 17, 3 + xxlandc 49, 49, 34 + vadduwm 13, 17, 13 + xxlxor 47, 45, 47 + vrlw 15, 15, 4 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vperm 17, 17, 17, 0 + xxlandc 49, 49, 37 + vadduwm 31, 17, 13 + xxlxor 45, 63, 47 + vrlw 15, 13, 1 + vadduwm 13, 16, 19 + xxswapd 48, 49 + xxsldwi 51, 51, 51, 3 + xxsldwi 45, 45, 45, 3 + vadduwm 17, 15, 13 + xxlxor 45, 49, 48 + lvx 16, 0, 4 + vperm 29, 13, 13, 3 + vperm 13, 18, 18, 7 + xxsldwi 50, 63, 63, 1 + vperm 16, 14, 30, 16 + vperm 7, 19, 19, 7 + xxlandc 63, 61, 34 + vadduwm 18, 31, 18 + vperm 29, 16, 13, 9 + xxlxor 47, 50, 47 + vperm 6, 16, 19, 6 + vrlw 15, 15, 4 + vperm 7, 8, 7, 9 + vadduwm 17, 17, 29 + xxmrgld 41, 61, 44 + vadduwm 17, 15, 17 + vperm 9, 28, 9, 10 + xxlxor 63, 49, 63 + xxsldwi 49, 49, 49, 1 + vperm 31, 31, 31, 0 + vadduwm 17, 17, 28 + xxlandc 63, 63, 37 + vadduwm 18, 31, 18 + xxswapd 63, 63 + xxlxor 47, 50, 47 + xxsldwi 46, 50, 50, 3 + vrlw 15, 15, 1 + vadduwm 17, 15, 17 + xxlxor 63, 49, 63 + vadduwm 17, 17, 12 + vperm 31, 31, 31, 3 + xxlandc 50, 63, 34 + vadduwm 14, 18, 14 + xxlxor 47, 46, 47 + vrlw 15, 15, 4 + vadduwm 17, 15, 17 + xxlxor 50, 49, 50 + vadduwm 6, 17, 6 + vperm 18, 18, 18, 0 + xxsldwi 38, 38, 38, 3 + xxlandc 50, 50, 37 + vadduwm 14, 18, 14 + xxswapd 48, 50 + xxlxor 47, 46, 47 + xxsldwi 46, 46, 46, 1 + vrlw 15, 15, 1 + vadduwm 6, 15, 6 + xxlxor 48, 38, 48 + vadduwm 6, 6, 7 + vperm 16, 16, 16, 3 + xxlandc 48, 48, 34 + vadduwm 14, 16, 14 + xxlxor 40, 46, 47 + vrlw 8, 8, 4 + vadduwm 6, 8, 6 + xxlxor 39, 38, 48 + xxsldwi 38, 38, 38, 1 + vperm 7, 7, 7, 0 + vadduwm 6, 6, 9 + xxlandc 39, 39, 37 + vadduwm 14, 7, 14 + xxswapd 39, 39 + xxlxor 40, 46, 40 + xxsldwi 41, 46, 46, 3 + vrlw 8, 8, 1 + vadduwm 6, 8, 6 + xxlxor 39, 38, 39 + vperm 3, 7, 7, 3 + vmrghw 7, 12, 13 + xxlandc 34, 35, 34 + vperm 7, 7, 28, 11 + vadduwm 3, 2, 9 + xxlxor 40, 35, 40 + vrlw 4, 8, 4 + vadduwm 6, 6, 7 + vadduwm 6, 4, 6 + xxlxor 34, 38, 34 + xxsldwi 0, 38, 38, 3 + vperm 2, 2, 2, 0 + xxlandc 34, 34, 37 + vadduwm 3, 2, 3 + xxswapd 34, 34 + xxlxor 36, 35, 36 + xxsldwi 1, 35, 35, 1 + vrlw 4, 4, 1 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 2, 36, 34 + stxvd2x 0, 0, 8 + xxswapd 2, 2 + stxvd2x 2, 8, 5 + lfdx 0, 0, 3 + lfd 2, 8(3) + xxmrghd 35, 2, 0 + xxlxor 0, 1, 35 + xxswapd 0, 0 + stxvd2x 0, 8, 7 + lfd 0, 16(3) + lfd 1, 24(3) + li 3, -16 + xxmrghd 35, 1, 0 + xxlxor 0, 34, 35 + xxswapd 0, 0 + stxvd2x 0, 8, 6 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-.Lfunc_begin1 + .cfi_endproc + + .globl zfs_blake3_hash_many_sse41 + .p2align 2 + .type zfs_blake3_hash_many_sse41,@function +zfs_blake3_hash_many_sse41: +.Lfunc_begin2: + .cfi_startproc +.Lfunc_gep2: + addis 2, 12, .TOC.-.Lfunc_gep2@ha + addi 2, 2, .TOC.-.Lfunc_gep2@l +.Lfunc_lep2: + .localentry zfs_blake3_hash_many_sse41, .Lfunc_lep2-.Lfunc_gep2 + mfocrf 12, 32 + mflr 0 + std 0, 16(1) + stw 12, 8(1) + stdu 1, -256(1) + .cfi_def_cfa_offset 256 + .cfi_offset lr, 16 + .cfi_offset r17, -120 + .cfi_offset r18, -112 + .cfi_offset r19, -104 + .cfi_offset r20, -96 + .cfi_offset r21, -88 + .cfi_offset r22, -80 + .cfi_offset r23, -72 + .cfi_offset r24, -64 + .cfi_offset r25, -56 + .cfi_offset r26, -48 + .cfi_offset r27, -40 + .cfi_offset r28, -32 + .cfi_offset r29, -24 + .cfi_offset r30, -16 + .cfi_offset cr2, 8 + std 26, 208(1) + mr 26, 4 + cmpldi 1, 4, 4 + andi. 4, 8, 1 + std 18, 144(1) + std 19, 152(1) + crmove 8, 1 + ld 19, 360(1) + lwz 18, 352(1) + std 24, 192(1) + std 25, 200(1) + std 27, 216(1) + std 28, 224(1) + mr 24, 10 + mr 28, 6 + mr 27, 5 + mr 25, 3 + std 29, 232(1) + std 30, 240(1) + mr 30, 9 + mr 29, 7 + std 17, 136(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + blt 1, .LBB2_3 + li 3, 0 + li 4, 1 + clrldi 23, 30, 32 + isel 22, 4, 3, 8 + clrldi 21, 24, 32 + clrldi 20, 18, 32 +.LBB2_2: + mr 3, 25 + mr 4, 27 + mr 5, 28 + mr 6, 29 + mr 7, 22 + mr 8, 23 + mr 9, 21 + mr 10, 20 + std 19, 32(1) + bl blake3_hash4_sse41 + addi 26, 26, -4 + addi 3, 29, 4 + addi 25, 25, 32 + addi 19, 19, 128 + cmpldi 26, 3 + isel 29, 3, 29, 8 + bgt 0, .LBB2_2 +.LBB2_3: + cmpldi 26, 0 + beq 0, .LBB2_11 + li 3, 0 + li 4, 1 + or 21, 24, 30 + li 20, 16 + addi 24, 1, 96 + isel 22, 4, 3, 8 +.LBB2_5: + lxvd2x 0, 28, 20 + ld 23, 0(25) + mr 17, 27 + mr 3, 21 + stxvd2x 0, 24, 20 + lxvd2x 0, 0, 28 + stxvd2x 0, 0, 24 +.LBB2_6: + cmpldi 17, 1 + beq 0, .LBB2_8 + cmpldi 17, 0 + bne 0, .LBB2_9 + b .LBB2_10 +.LBB2_8: + or 3, 3, 18 +.LBB2_9: + clrldi 7, 3, 56 + mr 3, 24 + mr 4, 23 + li 5, 64 + mr 6, 29 + bl zfs_blake3_compress_in_place_sse41 + addi 23, 23, 64 + addi 17, 17, -1 + mr 3, 30 + b .LBB2_6 +.LBB2_10: + lxvd2x 0, 24, 20 + addi 26, 26, -1 + add 29, 29, 22 + addi 25, 25, 8 + cmpldi 26, 0 + stxvd2x 0, 19, 20 + lxvd2x 0, 0, 24 + stxvd2x 0, 0, 19 + addi 19, 19, 32 + bne 0, .LBB2_5 +.LBB2_11: + ld 30, 240(1) + ld 29, 232(1) + ld 28, 224(1) + ld 27, 216(1) + ld 26, 208(1) + ld 25, 200(1) + ld 24, 192(1) + ld 23, 184(1) + ld 22, 176(1) + ld 21, 168(1) + ld 20, 160(1) + ld 19, 152(1) + ld 18, 144(1) + ld 17, 136(1) + addi 1, 1, 256 + ld 0, 16(1) + lwz 12, 8(1) + mtocrf 32, 12 + mtlr 0 + blr + .long 0 + .quad 0 +.Lfunc_end2: + .size zfs_blake3_hash_many_sse41, .Lfunc_end2-.Lfunc_begin2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI3_0: + .quad 4294967296 + .quad 12884901890 +.LCPI3_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI3_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI3_3: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI3_4: + .long 1779033703 + .long 1779033703 + .long 1779033703 + .long 1779033703 +.LCPI3_5: + .long 3144134277 + .long 3144134277 + .long 3144134277 + .long 3144134277 +.LCPI3_6: + .long 1013904242 + .long 1013904242 + .long 1013904242 + .long 1013904242 +.LCPI3_7: + .long 2773480762 + .long 2773480762 + .long 2773480762 + .long 2773480762 +.LCPI3_8: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .text + .p2align 2 + .type blake3_hash4_sse41,@function +blake3_hash4_sse41: +.Lfunc_begin3: + .cfi_startproc +.Lfunc_gep3: + addis 2, 12, .TOC.-.Lfunc_gep3@ha + addi 2, 2, .TOC.-.Lfunc_gep3@l +.Lfunc_lep3: + .localentry blake3_hash4_sse41, .Lfunc_lep3-.Lfunc_gep3 + stdu 1, -416(1) + .cfi_def_cfa_offset 416 + .cfi_offset r22, -176 + .cfi_offset r23, -168 + .cfi_offset r24, -160 + .cfi_offset r25, -152 + .cfi_offset r26, -144 + .cfi_offset r27, -136 + .cfi_offset r28, -128 + .cfi_offset r29, -120 + .cfi_offset r30, -112 + .cfi_offset f20, -96 + .cfi_offset f21, -88 + .cfi_offset f22, -80 + .cfi_offset f23, -72 + .cfi_offset f24, -64 + .cfi_offset f25, -56 + .cfi_offset f26, -48 + .cfi_offset f27, -40 + .cfi_offset f28, -32 + .cfi_offset f29, -24 + .cfi_offset f30, -16 + .cfi_offset f31, -8 + .cfi_offset v20, -368 + .cfi_offset v21, -352 + .cfi_offset v22, -336 + .cfi_offset v23, -320 + .cfi_offset v24, -304 + .cfi_offset v25, -288 + .cfi_offset v26, -272 + .cfi_offset v27, -256 + .cfi_offset v28, -240 + .cfi_offset v29, -224 + .cfi_offset v30, -208 + .cfi_offset v31, -192 + li 11, 48 + li 0, 8 + std 30, 304(1) + li 30, 12 + li 12, 4 + lfiwzx 0, 0, 5 + stxvd2x 52, 1, 11 + li 11, 64 + lfiwzx 2, 5, 0 + li 0, 20 + lfiwzx 3, 5, 30 + stxvd2x 53, 1, 11 + li 11, 80 + li 30, 24 + lfiwzx 4, 5, 0 + li 0, 28 + stxvd2x 54, 1, 11 + li 11, 96 + lfiwzx 1, 5, 12 + lfiwzx 6, 5, 30 + xxspltw 47, 0, 1 + cmpldi 4, 0 + std 22, 240(1) + stxvd2x 55, 1, 11 + li 11, 112 + lfiwzx 7, 5, 0 + xxspltw 40, 2, 1 + std 23, 248(1) + xxspltw 39, 3, 1 + std 24, 256(1) + std 25, 264(1) + xxspltw 51, 1, 1 + xxspltw 43, 6, 1 + std 26, 272(1) + xxspltw 41, 7, 1 + std 27, 280(1) + std 28, 288(1) + std 29, 296(1) + stxvd2x 56, 1, 11 + li 11, 128 + stfd 20, 320(1) + stxvd2x 57, 1, 11 + li 11, 144 + stfd 21, 328(1) + stxvd2x 58, 1, 11 + li 11, 160 + stfd 22, 336(1) + stxvd2x 59, 1, 11 + li 11, 176 + stfd 23, 344(1) + stxvd2x 60, 1, 11 + li 11, 192 + stfd 24, 352(1) + stxvd2x 61, 1, 11 + li 11, 208 + stfd 25, 360(1) + stxvd2x 62, 1, 11 + li 11, 224 + stfd 26, 368(1) + stxvd2x 63, 1, 11 + li 11, 16 + xxspltw 63, 4, 1 + lfiwzx 5, 5, 11 + ld 5, 448(1) + stfd 27, 376(1) + stfd 28, 384(1) + stfd 29, 392(1) + stfd 30, 400(1) + stfd 31, 408(1) + xxspltw 50, 5, 1 + beq 0, .LBB3_5 + addis 30, 2, .LCPI3_0@toc@ha + neg 7, 7 + xxleqv 34, 34, 34 + addis 28, 2, .LCPI3_5@toc@ha + addis 27, 2, .LCPI3_6@toc@ha + addis 26, 2, .LCPI3_7@toc@ha + addis 29, 2, .LCPI3_4@toc@ha + addis 25, 2, .LCPI3_8@toc@ha + addi 0, 30, .LCPI3_0@toc@l + mtfprwz 2, 7 + addis 7, 2, .LCPI3_1@toc@ha + addis 30, 2, .LCPI3_3@toc@ha + addi 24, 29, .LCPI3_4@toc@l + ld 29, 24(3) + lxvd2x 1, 0, 0 + mtfprwz 0, 6 + rldicl 6, 6, 32, 32 + addi 0, 30, .LCPI3_3@toc@l + ld 30, 16(3) + xxspltw 2, 2, 1 + vslw 2, 2, 2 + xxspltw 37, 0, 1 + mtfprwz 0, 6 + addi 6, 7, .LCPI3_1@toc@l + addis 7, 2, .LCPI3_2@toc@ha + xxswapd 35, 1 + xxlxor 36, 36, 36 + xxspltw 33, 0, 1 + xxland 35, 2, 35 + vadduwm 0, 3, 5 + lvx 5, 0, 6 + addi 6, 7, .LCPI3_2@toc@l + ld 7, 8(3) + xxlor 35, 35, 34 + xxlxor 34, 32, 34 + xxlor 9, 32, 32 + lvx 0, 0, 6 + ld 6, 0(3) + addi 3, 3, -8 + vcmpgtsw 2, 3, 2 + lvx 3, 0, 0 + addi 0, 28, .LCPI3_5@toc@l + addi 28, 27, .LCPI3_6@toc@l + addi 27, 26, .LCPI3_7@toc@l + addi 26, 25, .LCPI3_8@toc@l + or 25, 9, 8 + li 9, 0 + vcmpgtsb 5, 4, 5 + vcmpgtsb 0, 4, 0 + xxlor 11, 35, 35 + lvx 3, 0, 24 + xxlor 12, 35, 35 + vsubuwm 2, 1, 2 + xxlnor 10, 37, 37 + xxlor 13, 34, 34 + lvx 2, 0, 0 + li 0, 32 + xxlnor 31, 32, 32 + xxlor 30, 34, 34 + lvx 2, 0, 28 + li 28, 48 + xxlor 29, 34, 34 + lvx 2, 0, 27 + li 27, 0 + xxlor 28, 34, 34 + lvx 2, 0, 26 + xxlor 27, 34, 34 +.LBB3_2: + mr 26, 27 + addi 27, 27, 1 + xxlor 23, 39, 39 + cmpld 27, 4 + sldi 26, 26, 6 + xxlor 24, 40, 40 + iseleq 24, 10, 9 + add 23, 6, 26 + add 22, 30, 26 + lxvd2x 0, 6, 26 + lxvd2x 1, 7, 26 + or 25, 24, 25 + add 24, 7, 26 + lxvd2x 2, 30, 26 + lxvd2x 3, 29, 26 + xxlor 26, 47, 47 + lxvd2x 4, 23, 11 + lxvd2x 6, 24, 11 + clrlwi 25, 25, 24 + xxlor 25, 51, 51 + lxvd2x 7, 22, 11 + lxvd2x 8, 23, 0 + mtfprd 5, 25 + add 25, 29, 26 + xxswapd 34, 0 + lxvd2x 0, 25, 11 + xxswapd 38, 1 + xxswapd 32, 2 + lxvd2x 1, 24, 0 + lxvd2x 2, 22, 0 + xxswapd 40, 3 + xxswapd 39, 4 + lxvd2x 3, 25, 0 + lxvd2x 4, 23, 28 + xxswapd 60, 6 + xxswapd 47, 7 + lxvd2x 6, 24, 28 + xxswapd 57, 8 + lxvd2x 7, 22, 28 + lxvd2x 8, 25, 28 + xxswapd 58, 0 + mr 25, 3 + xxswapd 53, 1 + xxswapd 56, 2 + xxswapd 52, 3 + xxswapd 55, 4 + xxswapd 54, 6 + xxswapd 0, 5 + xxswapd 42, 7 + xxswapd 48, 8 + mtctr 12 +.LBB3_3: + ldu 24, 8(25) + add 24, 24, 26 + addi 24, 24, 256 + dcbt 0, 24 + bdnz .LBB3_3 + vmrgew 4, 28, 7 + vspltisw 14, 9 + mr 25, 8 + vmrgew 27, 6, 2 + vspltisw 17, 4 + vmrglw 12, 6, 2 + vspltisw 19, 10 + vmrghw 30, 6, 2 + xxspltw 0, 0, 3 + vmrglw 2, 8, 0 + vmrghw 13, 8, 0 + xxlor 7, 36, 36 + vmrgew 4, 21, 25 + vmrglw 29, 28, 7 + vmrghw 1, 28, 7 + vmrglw 28, 26, 15 + xxmrgld 37, 34, 44 + vmrgew 7, 26, 15 + vmrghw 15, 26, 15 + xxlor 21, 36, 36 + vmrglw 4, 21, 25 + vmrghw 21, 21, 25 + vmrglw 25, 20, 24 + xxmrgld 34, 60, 61 + vmrghw 26, 20, 24 + xxlor 38, 26, 26 + vmrgew 3, 8, 0 + xxlor 5, 36, 36 + vmrgew 4, 20, 24 + vspltisw 24, -16 + vmrglw 20, 22, 23 + xxmrgld 57, 57, 5 + vmrglw 8, 16, 10 + vmrghw 0, 16, 10 + vadduwm 12, 19, 19 + xxlor 8, 37, 37 + xxlor 20, 36, 36 + vmrgew 4, 22, 23 + vmrghw 23, 22, 23 + xxmrgld 40, 40, 52 + vmrgew 22, 16, 10 + vsubuwm 10, 14, 24 + vslw 14, 17, 17 + vadduwm 17, 5, 6 + xxmrgld 37, 47, 33 + xxlor 22, 36, 36 + xxmrgld 36, 45, 62 + xxlor 38, 25, 25 + xxlor 2, 34, 34 + vadduwm 19, 4, 6 + xxmrgld 38, 39, 7 + xxlor 3, 36, 36 + xxmrghd 39, 47, 33 + xxlor 36, 24, 24 + xxmrgld 33, 58, 53 + vadduwm 17, 17, 18 + vadduwm 29, 2, 4 + xxmrgld 36, 35, 59 + xxlor 34, 23, 23 + xxmrghd 35, 45, 62 + xxlor 1, 9, 9 + vadduwm 28, 5, 2 + xxlor 1, 13, 13 + vadduwm 19, 19, 31 + vadduwm 24, 29, 11 + vadduwm 28, 28, 9 + xxlxor 61, 49, 9 + xxlor 1, 41, 41 + xxlor 41, 11, 11 + xxlxor 34, 51, 13 + vperm 29, 29, 29, 9 + xxlxor 46, 56, 46 + vperm 2, 2, 2, 9 + xxlxor 59, 60, 0 + vperm 14, 14, 14, 9 + vperm 30, 27, 27, 9 + vadduwm 19, 19, 3 + xxlor 4, 35, 35 + xxland 61, 61, 10 + xxlor 35, 12, 12 + xxland 34, 34, 10 + vadduwm 27, 29, 3 + xxlor 35, 30, 30 + vadduwm 17, 17, 4 + xxlor 26, 36, 36 + xxland 46, 46, 10 + vadduwm 3, 2, 3 + xxlor 36, 29, 29 + xxland 62, 62, 10 + xxlxor 45, 59, 50 + xxlxor 50, 35, 63 + vadduwm 31, 14, 4 + xxlor 36, 28, 28 + xxlor 6, 37, 37 + vadduwm 16, 30, 4 + xxlxor 43, 63, 43 + xxlxor 37, 48, 1 + vrlw 4, 13, 12 + vrlw 18, 18, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 15, 24, 6 + vadduwm 28, 28, 7 + vadduwm 17, 4, 17 + vadduwm 19, 18, 19 + vadduwm 15, 11, 15 + vadduwm 28, 5, 28 + xxlor 25, 38, 38 + xxlxor 61, 49, 61 + xxlxor 34, 51, 34 + xxlxor 46, 47, 46 + xxlxor 62, 60, 62 + xxlor 38, 27, 27 + vadduwm 19, 19, 1 + vperm 29, 29, 29, 6 + vperm 2, 2, 2, 6 + vperm 24, 14, 14, 6 + vperm 30, 30, 30, 6 + xxlor 5, 33, 33 + vadduwm 17, 17, 25 + xxland 61, 61, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + xxland 62, 62, 31 + vadduwm 27, 29, 27 + vadduwm 3, 2, 3 + vadduwm 31, 24, 31 + vadduwm 16, 30, 16 + xxlxor 36, 59, 36 + xxlxor 50, 35, 50 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 18, 10 + xxmrgld 50, 32, 55 + vrlw 11, 11, 10 + xxmrghd 55, 32, 55 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 15, 15, 8 + vadduwm 28, 28, 18 + vadduwm 17, 1, 17 + vadduwm 19, 11, 19 + vadduwm 15, 5, 15 + vadduwm 28, 4, 28 + xxlor 7, 57, 57 + xxlxor 62, 49, 62 + xxlxor 61, 51, 61 + xxlxor 57, 47, 34 + xxlxor 34, 60, 56 + vperm 24, 30, 30, 9 + xxmrgld 62, 20, 21 + vperm 29, 29, 29, 9 + vperm 25, 25, 25, 9 + vperm 2, 2, 2, 9 + vmr 14, 8 + xxmrghd 40, 58, 53 + xxmrgld 58, 54, 22 + vadduwm 17, 17, 30 + xxland 56, 56, 10 + vadduwm 21, 19, 8 + xxland 61, 61, 10 + xxland 51, 57, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vadduwm 0, 15, 26 + vadduwm 15, 28, 23 + vadduwm 17, 1, 17 + vadduwm 28, 11, 21 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vmr 13, 8 + xxlor 53, 3, 3 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 52, 4, 4 + xxlor 40, 2, 2 + vadduwm 17, 17, 21 + vadduwm 28, 28, 20 + vadduwm 0, 0, 7 + vadduwm 15, 15, 8 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + vmr 25, 26 + xxlor 3, 39, 39 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 54, 6, 6 + xxlor 58, 5, 5 + xxlor 39, 8, 8 + vadduwm 17, 17, 22 + vadduwm 28, 28, 26 + vadduwm 0, 0, 7 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 39, 26, 26 + vadduwm 28, 28, 14 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 7 + vadduwm 0, 0, 30 + vadduwm 15, 15, 23 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 24, 55, 55 + vadduwm 17, 17, 13 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vmr 23, 13 + xxlor 45, 25, 25 + xxlor 39, 7, 7 + vadduwm 28, 28, 13 + vadduwm 0, 0, 18 + vadduwm 15, 15, 7 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 2, 46, 46 + xxlor 46, 3, 3 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vadduwm 17, 17, 20 + vadduwm 28, 28, 26 + vadduwm 0, 0, 25 + vadduwm 15, 15, 14 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 52, 2, 2 + vadduwm 17, 17, 8 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 28, 28, 20 + vadduwm 0, 0, 21 + vadduwm 15, 15, 18 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + vadduwm 17, 17, 22 + vadduwm 28, 28, 30 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 0, 0, 23 + vadduwm 15, 15, 7 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 5, 4, 4 + xxlor 4, 58, 58 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 39, 8, 8 + xxlor 54, 24, 24 + xxlor 58, 26, 26 + vadduwm 17, 17, 13 + vadduwm 28, 28, 7 + vadduwm 0, 0, 22 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 3, 53, 53 + xxlor 53, 4, 4 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vadduwm 17, 17, 21 + vadduwm 28, 28, 20 + vadduwm 0, 0, 18 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 2, 55, 55 + vmr 23, 18 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 50, 5, 5 + vadduwm 17, 17, 14 + vadduwm 28, 28, 30 + vadduwm 0, 0, 18 + vadduwm 15, 15, 22 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 25, 40, 40 + vmr 8, 13 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + xxlor 45, 25, 25 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 13 + xxlor 45, 2, 2 + vadduwm 0, 0, 8 + vadduwm 28, 28, 13 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 4, 57, 57 + xxlor 26, 46, 46 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 8, 62, 62 + xxlor 57, 3, 3 + xxlor 46, 7, 7 + xxlor 62, 6, 6 + vadduwm 17, 17, 7 + vadduwm 28, 28, 25 + vadduwm 0, 0, 14 + vadduwm 15, 15, 30 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vadduwm 17, 17, 20 + xxlor 3, 52, 52 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 52, 8, 8 + vadduwm 0, 0, 22 + vadduwm 28, 28, 20 + vadduwm 15, 15, 23 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 6, 55, 55 + xxlor 55, 4, 4 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 17, 17, 23 + vadduwm 28, 28, 13 + vadduwm 0, 0, 21 + vadduwm 15, 15, 14 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 4, 53, 53 + xxlor 53, 26, 26 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 21 + vadduwm 28, 28, 8 + vadduwm 0, 0, 7 + vadduwm 15, 15, 30 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 5, 25, 25 + xxlor 2, 58, 58 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vmr 22, 26 + vadduwm 0, 0, 26 + xxlor 58, 5, 5 + vadduwm 17, 17, 25 + vadduwm 28, 28, 18 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 7, 24, 24 + xxlor 8, 57, 57 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 57, 7, 7 + vadduwm 17, 17, 20 + vadduwm 28, 28, 13 + vadduwm 0, 0, 14 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 5, 52, 52 + xxlor 23, 45, 45 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 52, 6, 6 + vadduwm 28, 28, 8 + vmr 13, 8 + xxlor 40, 3, 3 + vadduwm 17, 17, 20 + vadduwm 0, 0, 8 + vadduwm 15, 15, 22 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 25, 39, 39 + vmr 7, 30 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vmr 30, 18 + xxlor 24, 46, 46 + xxlor 46, 25, 25 + xxlor 50, 8, 8 + vadduwm 17, 17, 23 + vadduwm 28, 28, 14 + vadduwm 0, 0, 18 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 6, 58, 58 + xxlor 58, 4, 4 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vadduwm 17, 17, 30 + vadduwm 28, 28, 26 + vadduwm 0, 0, 7 + vadduwm 15, 15, 21 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 40, 23, 23 + vadduwm 13, 28, 13 + vadduwm 8, 17, 8 + xxland 49, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 17, 31 + vadduwm 16, 29, 16 + vadduwm 28, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 60, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 2, 55, 55 + vmr 23, 30 + xxlor 62, 24, 24 + vadduwm 0, 0, 22 + vadduwm 15, 15, 30 + vadduwm 8, 4, 8 + vadduwm 13, 1, 13 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 40, 61 + xxlxor 51, 45, 51 + xxlxor 34, 32, 34 + xxlxor 49, 47, 49 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 17, 17, 17, 9 + vadduwm 13, 13, 14 + xxlor 46, 5, 5 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 49, 49, 10 + vadduwm 28, 29, 28 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 17, 16 + xxlxor 36, 60, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 8, 8, 25 + vadduwm 0, 0, 14 + vadduwm 15, 15, 7 + vadduwm 8, 4, 8 + vadduwm 13, 1, 13 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 62, 40, 61 + xxlxor 51, 45, 51 + xxlxor 34, 32, 34 + xxlxor 49, 47, 49 + vperm 30, 30, 30, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 17, 17, 17, 6 + vadduwm 29, 8, 20 + vadduwm 8, 13, 18 + xxland 45, 62, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 49, 49, 31 + vadduwm 30, 13, 28 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 17, 16 + xxlxor 36, 62, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 0, 0, 23 + vadduwm 7, 15, 21 + vadduwm 29, 1, 29 + vadduwm 8, 11, 8 + vadduwm 0, 5, 0 + vadduwm 7, 4, 7 + xxlxor 47, 61, 49 + xxlxor 45, 40, 45 + xxlxor 49, 32, 51 + xxlxor 34, 39, 34 + vperm 15, 15, 15, 9 + vperm 13, 13, 13, 9 + vperm 17, 17, 17, 9 + vperm 2, 2, 2, 9 + xxlor 46, 3, 3 + vadduwm 9, 29, 26 + vadduwm 8, 8, 14 + xxland 46, 47, 10 + xxland 45, 45, 10 + xxland 47, 49, 10 + xxland 34, 34, 10 + vadduwm 17, 14, 31 + vadduwm 16, 13, 16 + vadduwm 18, 15, 30 + vadduwm 3, 2, 3 + xxlxor 33, 49, 33 + xxlxor 43, 48, 43 + xxlxor 37, 50, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 44, 6, 6 + xxlor 0, 10, 10 + vadduwm 0, 0, 12 + xxlor 44, 2, 2 + vadduwm 9, 1, 9 + vadduwm 7, 7, 12 + vadduwm 8, 11, 8 + vadduwm 7, 4, 7 + vadduwm 0, 5, 0 + xxlxor 34, 39, 34 + xxlxor 44, 32, 47 + vperm 2, 2, 2, 6 + xxlxor 46, 41, 46 + xxlxor 45, 40, 45 + vperm 12, 12, 12, 6 + vperm 14, 14, 14, 6 + vperm 13, 13, 13, 6 + xxland 34, 34, 31 + xxlor 1, 31, 31 + vadduwm 3, 2, 3 + xxland 44, 44, 31 + xxlxor 36, 35, 36 + xxlxor 51, 35, 40 + xxland 35, 46, 31 + xxland 38, 45, 31 + vadduwm 15, 12, 18 + vadduwm 8, 3, 17 + vadduwm 13, 6, 16 + xxlxor 37, 47, 37 + xxlxor 33, 40, 33 + xxlxor 43, 45, 43 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlxor 47, 47, 41 + xxlxor 40, 40, 32 + xxlxor 39, 45, 39 + xxlxor 50, 36, 38 + xxlxor 63, 33, 44 + xxlxor 43, 43, 34 + xxlxor 41, 37, 35 + bne 0, .LBB3_2 +.LBB3_5: + vmrglw 2, 19, 15 + li 3, 32 + li 4, 48 + vmrglw 4, 7, 8 + vmrglw 0, 31, 18 + vmrglw 1, 9, 11 + vmrghw 3, 19, 15 + vmrghw 5, 7, 8 + vmrghw 6, 31, 18 + vmrghw 7, 9, 11 + xxmrgld 40, 36, 34 + xxmrghd 34, 36, 34 + xxmrgld 41, 33, 32 + xxswapd 0, 40 + xxmrgld 36, 37, 35 + xxmrghd 35, 37, 35 + xxmrghd 37, 33, 32 + xxswapd 1, 41 + xxmrgld 32, 39, 38 + xxmrghd 33, 39, 38 + xxswapd 2, 34 + xxswapd 4, 36 + xxswapd 3, 37 + stxvd2x 0, 0, 5 + xxswapd 5, 32 + stxvd2x 1, 5, 11 + xxswapd 0, 35 + xxswapd 1, 33 + stxvd2x 2, 5, 3 + li 3, 64 + stxvd2x 3, 5, 4 + li 4, 80 + stxvd2x 4, 5, 3 + li 3, 96 + stxvd2x 5, 5, 4 + li 4, 112 + stxvd2x 0, 5, 3 + stxvd2x 1, 5, 4 + li 3, 224 + lxvd2x 63, 1, 3 + li 3, 208 + lfd 31, 408(1) + ld 30, 304(1) + ld 29, 296(1) + lxvd2x 62, 1, 3 + li 3, 192 + lfd 30, 400(1) + ld 28, 288(1) + ld 27, 280(1) + lxvd2x 61, 1, 3 + li 3, 176 + lfd 29, 392(1) + ld 26, 272(1) + ld 25, 264(1) + lxvd2x 60, 1, 3 + li 3, 160 + lfd 28, 384(1) + ld 24, 256(1) + ld 23, 248(1) + lxvd2x 59, 1, 3 + li 3, 144 + lfd 27, 376(1) + ld 22, 240(1) + lxvd2x 58, 1, 3 + li 3, 128 + lfd 26, 368(1) + lxvd2x 57, 1, 3 + li 3, 112 + lfd 25, 360(1) + lxvd2x 56, 1, 3 + li 3, 96 + lfd 24, 352(1) + lxvd2x 55, 1, 3 + li 3, 80 + lfd 23, 344(1) + lxvd2x 54, 1, 3 + li 3, 64 + lfd 22, 336(1) + lxvd2x 53, 1, 3 + li 3, 48 + lfd 21, 328(1) + lxvd2x 52, 1, 3 + lfd 20, 320(1) + addi 1, 1, 416 + blr + .long 0 + .quad 0 +.Lfunc_end3: + .size blake3_hash4_sse41, .Lfunc_end3-.Lfunc_begin3 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S new file mode 100644 index 000000000..b15d8fc77 --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -0,0 +1,1845 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_AVX2) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_avx2 +.text + +.type zfs_blake3_hash_many_avx2,@function +.p2align 6 +zfs_blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.size zfs_blake3_hash_many_avx2, . - zfs_blake3_hash_many_avx2 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif + +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A +#endif /* HAVE_AVX2 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx512.S b/module/icp/asm-x86_64/blake3/blake3_avx512.S new file mode 100644 index 000000000..d02c5e7ec --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_avx512.S @@ -0,0 +1,2618 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_avx512 +.global zfs_blake3_compress_in_place_avx512 +.global zfs_blake3_compress_xof_avx512 +.text + +.type zfs_blake3_hash_many_avx512,@function +.type zfs_blake3_compress_xof_avx512,@function +.type zfs_blake3_compress_in_place_avx512,@function + +.p2align 6 +zfs_blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti32x8 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +zfs_blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +zfs_blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512 +.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512 +.size zfs_blake3_compress_xof_avx512, . - zfs_blake3_compress_xof_avx512 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif + +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A + +#endif /* HAVE_AVX512 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_sse2.S b/module/icp/asm-x86_64/blake3/blake3_sse2.S new file mode 100644 index 000000000..39d23ee23 --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_sse2.S @@ -0,0 +1,2323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_SSE2) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_sse2 +.global zfs_blake3_compress_in_place_sse2 +.global zfs_blake3_compress_xof_sse2 + +.text +.type zfs_blake3_hash_many_sse2,@function +.type zfs_blake3_compress_in_place_sse2,@function +.type zfs_blake3_compress_xof_sse2,@function + + .p2align 6 +zfs_blake3_hash_many_sse2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +zfs_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +zfs_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2 +.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2 +.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF + +#endif /* HAVE_SSE2 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_sse41.S b/module/icp/asm-x86_64/blake3/blake3_sse41.S new file mode 100644 index 000000000..1c40236f0 --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_sse41.S @@ -0,0 +1,2058 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_SSE4_1) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_compress_in_place_sse41 +.global zfs_blake3_compress_xof_sse41 +.global zfs_blake3_hash_many_sse41 + +.text +.type zfs_blake3_hash_many_sse41,@function +.type zfs_blake3_compress_in_place_sse41,@function +.type zfs_blake3_compress_xof_sse41,@function + +.p2align 6 +zfs_blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +zfs_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret +.p2align 6 +zfs_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41 +.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41 +.size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + +#endif /* HAVE_SSE4_1 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index f09389e6d..4df09884a 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -696,16 +696,15 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { - - static const spa_feature_t zilsaxattr_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_ZILSAXATTR, - "org.openzfs:zilsaxattr", "zilsaxattr", - "Support for xattr=sa extended attribute logging in ZIL.", - ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, - ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); + static const spa_feature_t zilsaxattr_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_ZILSAXATTR, + "org.openzfs:zilsaxattr", "zilsaxattr", + "Support for xattr=sa extended attribute logging in ZIL.", + ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, + ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); } zfeature_register(SPA_FEATURE_HEAD_ERRLOG, @@ -714,6 +713,18 @@ zpool_feature_init(void) ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t blake3_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_BLAKE3, + "org.openzfs:blake3", "blake3", + "BLAKE3 hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + blake3_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 500d80a33..32475611e 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -84,6 +84,7 @@ zfs_prop_init(void) { "sha512", ZIO_CHECKSUM_SHA512 }, { "skein", ZIO_CHECKSUM_SKEIN }, { "edonr", ZIO_CHECKSUM_EDONR }, + { "blake3", ZIO_CHECKSUM_BLAKE3 }, { NULL } }; @@ -102,6 +103,9 @@ zfs_prop_init(void) ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, { "edonr,verify", ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, + { "blake3", ZIO_CHECKSUM_BLAKE3 }, + { "blake3,verify", + ZIO_CHECKSUM_BLAKE3 | ZIO_CHECKSUM_VERIFY }, { NULL } }; @@ -394,12 +398,12 @@ zfs_prop_init(void) ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein" - " | edonr", + " | edonr | blake3", "CHECKSUM", checksum_table, sfeatures); zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | verify | sha256[,verify] | sha512[,verify] | " - "skein[,verify] | edonr,verify", + "skein[,verify] | edonr,verify | blake3[,verify]", "DEDUP", dedup_table, sfeatures); zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, diff --git a/module/zfs/blake3_zfs.c b/module/zfs/blake3_zfs.c new file mode 100644 index 000000000..51c455fe7 --- /dev/null +++ b/module/zfs/blake3_zfs.c @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2022 Tino Reichardt <[email protected]> + */ + +#include <sys/zfs_context.h> +#include <sys/zio_checksum.h> +#include <sys/blake3.h> +#include <sys/abd.h> + +static int +blake3_incremental(void *buf, size_t size, void *arg) +{ + BLAKE3_CTX *ctx = arg; + + Blake3_Update(ctx, buf, size); + + return (0); +} + +/* + * Computes a native 256-bit BLAKE3 MAC checksum. Please note that this + * function requires the presence of a ctx_template that should be allocated + * using abd_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template, + zio_cksum_t *zcp) +{ + BLAKE3_CTX *ctx; + + ctx = kmem_alloc(sizeof (*ctx), KM_NOSLEEP); + ASSERT(ctx != 0); + ASSERT(ctx_template != 0); + + memcpy(ctx, ctx_template, sizeof (*ctx)); + (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx); + Blake3_Final(ctx, (uint8_t *)zcp); + + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} + +/* + * Byteswapped version of abd_checksum_blake3_native. This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * BLAKE3 is internally endian-insensitive). + */ +void +abd_checksum_blake3_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + ASSERT(ctx_template != 0); + + abd_checksum_blake3_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a BLAKE3 MAC template suitable for using in BLAKE3 MAC checksum + * computations and returns a pointer to it. + */ +void * +abd_checksum_blake3_tmpl_init(const zio_cksum_salt_t *salt) +{ + BLAKE3_CTX *ctx; + + ASSERT(sizeof (salt->zcs_bytes) == 32); + + /* init reference object */ + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + Blake3_InitKeyed(ctx, salt->zcs_bytes); + + return (ctx); +} + +/* + * Frees a BLAKE3 context template previously allocated using + * zio_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_tmpl_free(void *ctx_template) +{ + BLAKE3_CTX *ctx = ctx_template; + + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 12aec4a56..c57c69bd7 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -30,6 +30,7 @@ */ #include <sys/zfs_context.h> +#include <sys/zfs_chksum.h> #include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> @@ -2417,6 +2418,7 @@ spa_init(spa_mode_t mode) vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); + chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); @@ -2438,6 +2440,7 @@ spa_fini(void) vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); + chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c new file mode 100644 index 000000000..3ebe08541 --- /dev/null +++ b/module/zfs/zfs_chksum.c @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021 Tino Reichardt <[email protected]> + */ + +#include <sys/types.h> +#include <sys/spa.h> +#include <sys/zio_checksum.h> +#include <sys/zfs_context.h> +#include <sys/zfs_chksum.h> + +#include <sys/blake3.h> + +static kstat_t *chksum_kstat = NULL; + +typedef struct { + const char *name; + const char *impl; + uint64_t bs1k; + uint64_t bs4k; + uint64_t bs16k; + uint64_t bs64k; + uint64_t bs256k; + uint64_t bs1m; + uint64_t bs4m; + zio_cksum_salt_t salt; + zio_checksum_t *(func); + zio_checksum_tmpl_init_t *(init); + zio_checksum_tmpl_free_t *(free); +} chksum_stat_t; + +static int chksum_stat_cnt = 0; +static chksum_stat_t *chksum_stat_data = 0; + +/* + * i3-1005G1 test output: + * + * implementation 1k 4k 16k 64k 256k 1m 4m + * fletcher-4 5421 15001 26468 32555 34720 32801 18847 + * edonr-generic 1196 1602 1761 1749 1762 1759 1751 + * skein-generic 546 591 608 615 619 612 616 + * sha256-generic 246 270 274 274 277 275 276 + * sha256-avx 262 296 304 307 307 307 306 + * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228 + * sha256-openssl 240 300 316 314 304 285 276 + * sha512-generic 333 374 385 392 391 393 392 + * sha512-openssl 353 441 467 476 472 467 426 + * sha512-avx 362 444 473 475 479 476 478 + * sha512-avx2 394 500 530 538 543 545 542 + * blake3-generic 308 313 313 313 312 313 312 + * blake3-sse2 402 1289 1423 1446 1432 1458 1413 + * blake3-sse41 427 1470 1625 1704 1679 1607 1629 + * blake3-avx2 428 1920 3095 3343 3356 3318 3204 + * blake3-avx512 473 2687 4905 5836 5844 5643 5374 + */ +static int +chksum_stat_kstat_headers(char *buf, size_t size) +{ + ssize_t off = 0; + + off += snprintf(buf + off, size, "%-23s", "implementation"); + off += snprintf(buf + off, size - off, "%8s", "1k"); + off += snprintf(buf + off, size - off, "%8s", "4k"); + off += snprintf(buf + off, size - off, "%8s", "16k"); + off += snprintf(buf + off, size - off, "%8s", "64k"); + off += snprintf(buf + off, size - off, "%8s", "256k"); + off += snprintf(buf + off, size - off, "%8s", "1m"); + (void) snprintf(buf + off, size - off, "%8s\n", "4m"); + + return (0); +} + +static int +chksum_stat_kstat_data(char *buf, size_t size, void *data) +{ + chksum_stat_t *cs; + ssize_t off = 0; + char b[24]; + + cs = (chksum_stat_t *)data; + snprintf(b, 23, "%s-%s", cs->name, cs->impl); + off += snprintf(buf + off, size - off, "%-23s", b); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs4k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs16k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs64k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs256k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1m); + (void) snprintf(buf + off, size - off, "%8llu\n", + (u_longlong_t)cs->bs4m); + + return (0); +} + +static void * +chksum_stat_kstat_addr(kstat_t *ksp, loff_t n) +{ + if (n < chksum_stat_cnt) + ksp->ks_private = (void *)(chksum_stat_data + n); + else + ksp->ks_private = NULL; + + return (ksp->ks_private); +} + +static void +chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, + uint64_t *result) +{ + hrtime_t start; + uint64_t run_bw, run_time_ns, run_count = 0, size = 0; + uint32_t l, loops = 0; + zio_cksum_t zcp; + + switch (round) { + case 1: /* 1k */ + size = 1<<10; loops = 128; break; + case 2: /* 2k */ + size = 1<<12; loops = 64; break; + case 3: /* 4k */ + size = 1<<14; loops = 32; break; + case 4: /* 16k */ + size = 1<<16; loops = 16; break; + case 5: /* 256k */ + size = 1<<18; loops = 8; break; + case 6: /* 1m */ + size = 1<<20; loops = 4; break; + case 7: /* 4m */ + size = 1<<22; loops = 1; break; + } + + kpreempt_disable(); + start = gethrtime(); + do { + for (l = 0; l < loops; l++, run_count++) + cs->func(abd, size, ctx, &zcp); + + run_time_ns = gethrtime() - start; + } while (run_time_ns < MSEC2NSEC(1)); + kpreempt_enable(); + + run_bw = size * run_count * NANOSEC; + run_bw /= run_time_ns; /* B/s */ + *result = run_bw/1024/1024; /* MiB/s */ +} + +static void +chksum_benchit(chksum_stat_t *cs) +{ + abd_t *abd; + void *ctx = 0; + void *salt = &cs->salt.zcs_bytes; + + /* allocate test memory via default abd interface */ + abd = abd_alloc_linear(1<<22, B_FALSE); + memset(salt, 0, sizeof (cs->salt.zcs_bytes)); + if (cs->init) { + ctx = cs->init(&cs->salt); + } + + chksum_run(cs, abd, ctx, 1, &cs->bs1k); + chksum_run(cs, abd, ctx, 2, &cs->bs4k); + chksum_run(cs, abd, ctx, 3, &cs->bs16k); + chksum_run(cs, abd, ctx, 4, &cs->bs64k); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + chksum_run(cs, abd, ctx, 6, &cs->bs1m); + chksum_run(cs, abd, ctx, 7, &cs->bs4m); + + /* free up temp memory */ + if (cs->free) { + cs->free(ctx); + } + abd_free(abd); +} + +/* + * Initialize and benchmark all supported implementations. + */ +static void +chksum_benchmark(void) +{ + +#ifndef _KERNEL + /* we need the benchmark only for the kernel module */ + return; +#endif + + chksum_stat_t *cs; + int cbid = 0, id; + uint64_t max = 0; + + /* space for the benchmark times */ + chksum_stat_cnt = 4; + chksum_stat_cnt += blake3_get_impl_count(); + chksum_stat_data = (chksum_stat_t *)kmem_zalloc( + sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); + + /* edonr */ + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_edonr_tmpl_init; + cs->func = abd_checksum_edonr_native; + cs->free = abd_checksum_edonr_tmpl_free; + cs->name = "edonr"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* skein */ + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_skein_tmpl_init; + cs->func = abd_checksum_skein_native; + cs->free = abd_checksum_skein_tmpl_free; + cs->name = "skein"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* sha256 */ + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_SHA256; + cs->free = 0; + cs->name = "sha256"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* sha512 */ + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_SHA512_native; + cs->free = 0; + cs->name = "sha512"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* blake3 */ + for (id = 0; id < blake3_get_impl_count(); id++) { + blake3_set_impl_id(id); + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_blake3_tmpl_init; + cs->func = abd_checksum_blake3_native; + cs->free = abd_checksum_blake3_tmpl_free; + cs->name = "blake3"; + cs->impl = blake3_get_impl_name(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + blake3_set_impl_fastest(id); + } + } +} + +void +chksum_init(void) +{ + + /* Benchmark supported implementations */ + chksum_benchmark(); + + /* Install kstats for all implementations */ + chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (chksum_kstat != NULL) { + chksum_kstat->ks_data = NULL; + chksum_kstat->ks_ndata = UINT32_MAX; + kstat_set_raw_ops(chksum_kstat, + chksum_stat_kstat_headers, + chksum_stat_kstat_data, + chksum_stat_kstat_addr); + kstat_install(chksum_kstat); + } + + /* setup implementations */ + blake3_setup_impl(); +} + +void +chksum_fini(void) +{ + if (chksum_kstat != NULL) { + kstat_delete(chksum_kstat); + chksum_kstat = NULL; + } + + if (chksum_stat_cnt) { + kmem_free(chksum_stat_data, + sizeof (chksum_stat_t) * chksum_stat_cnt); + chksum_stat_cnt = 0; + chksum_stat_data = 0; + } +} diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index d89e57653..3c5cdf604 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -195,6 +195,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, + {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap}, + abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, }; /* @@ -207,6 +211,8 @@ zio_checksum_to_feature(enum zio_checksum cksum) VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); switch (cksum) { + case ZIO_CHECKSUM_BLAKE3: + return (SPA_FEATURE_BLAKE3); case ZIO_CHECKSUM_SHA512: return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 4ff46e7af..243221598 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -113,8 +113,8 @@ tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/checksum] -tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'filetest_001_pos', - 'filetest_002_pos'] +tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test', + 'filetest_001_pos', 'filetest_002_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 1830cab76..20d138253 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -42,6 +42,7 @@ /ereports /zfs_diff-socket /dosmode_readonly_write +/blake3_test /edonr_test /skein_test /sha2_test diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index e3c9874dc..3c8faf5af 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -98,15 +98,19 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/ereports libzfs.la -scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test %D%/sha2_test +scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test \ + %D%/sha2_test %D%/blake3_test %C%_skein_test_SOURCES = %D%/checksum/skein_test.c %C%_sha2_test_SOURCES = %D%/checksum/sha2_test.c %C%_edonr_test_SOURCES = %D%/checksum/edonr_test.c +%C%_blake3_test_SOURCES = %D%/checksum/blake3_test.c %C%_skein_test_LDADD = \ libicp.la \ + libspl.la \ libspl_assert.la %C%_sha2_test_LDADD = $(%C%_skein_test_LDADD) %C%_edonr_test_LDADD = $(%C%_skein_test_LDADD) +%C%_blake3_test_LDADD = $(%C%_skein_test_LDADD) if BUILD_LINUX diff --git a/tests/zfs-tests/cmd/checksum/blake3_test.c b/tests/zfs-tests/cmd/checksum/blake3_test.c new file mode 100644 index 000000000..55d268f5f --- /dev/null +++ b/tests/zfs-tests/cmd/checksum/blake3_test.c @@ -0,0 +1,575 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <sys/time.h> +#include <sys/blake3.h> + +/* + * set it to a define for debugging + */ +#undef BLAKE3_DEBUG + +/* + * C version of: + * https://github.com/BLAKE3-team/BLAKE3/tree/master/test_vectors + */ +typedef struct { + /* input length for this entry */ + const int input_len; + + /* hash value */ + const char *hash; + + /* salted hash value */ + const char *shash; +} blake3_test_t; + +/* BLAKE3 is variable here */ +#define TEST_DIGEST_LEN 262 + +/* + * key for the keyed hashing + */ +static const char *salt = "whats the Elvish word for friend"; + +static blake3_test_t TestArray[] = { + { + 0, + "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e0" + "0f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5" + "487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c2" + "2e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", + "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b1" + "8171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73" + "cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be589" + "60856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", + }, + { + 1, + "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3" + "a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358a" + "d4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4" + "081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", + "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b65" + "68c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0c" + "f7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f9" + "8fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", + }, + { + 2, + "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8" + "386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac" + "61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a43226" + "3a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1", + "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9f" + "fbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f65" + "8be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786" + "024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f", + }, + { + 3, + "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b" + "49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454" + "b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cd" + "d0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134", + "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9" + "142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1" + "a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690" + "bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f", + }, + { + 4, + "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f" + "603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b" + "56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e657972117" + "01dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12", + "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe07011" + "6c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf" + "81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf" + "0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a", + }, + { + 5, + "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2eb" + "cfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2c" + "a748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c9999" + "04037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620", + "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616a" + "b199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b" + "38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0" + "d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218", + }, + { + 6, + "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c84461" + "1a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435" + "d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac" + "1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a", + "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2" + "ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e880780084" + "2a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887" + "603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256", + }, + { + 7, + "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66" + "036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a9" + "41f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fe" + "f1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c", + "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c" + "5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5f" + "d6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617" + "bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6", + }, + { + 8, + "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb72" + "5d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f" + "9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a2" + "2e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c", + "be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048" + "eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d" + "13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305ab" + "f86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276", + }, + { + 63, + "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b11" + "97012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf46" + "87093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3" + "ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755", + "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea0" + "5a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847ab" + "b38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f116783" + "77483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d", + }, + { + 64, + "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc" + "9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7f" + "bb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95" + "b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74", + "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e682" + "44c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f7" + "7a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c92" + "55306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb", + }, + { + 65, + "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e" + "16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3" + "a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d" + "0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c", + "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5" + "e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc" + "5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b904149" + "7de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad", + }, + { + 127, + "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3" + "137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da4" + "7644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc4435" + "5b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78", + "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee" + "7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd5" + "4663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d" + "135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc", + }, + { + 128, + "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa6" + "9faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ec" + "ba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f7" + "5e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c", + "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd8" + "6bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50" + "edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad" + "92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5", + }, + { + 129, + "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f9" + "6ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c71" + "27bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7" + "e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7", + "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c" + "9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aa" + "ee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412c" + "d8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683", + }, + { + 1023, + "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a1" + "82d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56" + "778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b2" + "8f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", + "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e89" + "0316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13e" + "fd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc" + "97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", + }, + { + 1024, + "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71c" + "f8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f" + "6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d91" + "7f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", + "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a7" + "8bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a" + "8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b50002" + "36df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", + }, + { + 1025, + "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4" + "c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332" + "b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f9" + "55c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", + "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea6936" + "2396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd535" + "2720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123" + "872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", + }, + { + 2048, + "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a" + "60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d0" + "63f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c6" + "7ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", + "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd101" + "73b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b2" + "2f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef860" + "54f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", + }, + { + 2049, + "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096" + "de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae9" + "8764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d9042" + "5a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", + "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9" + "a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81" + "447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c6464" + "99ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", + }, + { + 3072, + "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a" + "3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d12" + "0258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d15" + "99b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", + "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022" + "f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c78" + "3a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b" + "996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", + }, + { + 3073, + "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a" + "27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b" + "639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd" + "66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", + "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96" + "d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfdd" + "d6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea" + "2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", + }, + { + 4096, + "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e96902" + "89e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85" + "c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0" + "062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", + "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bb" + "b64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c" + "757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb1" + "7d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", + }, + { + 4097, + "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505" + "f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd" + "26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61db" + "e091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", + "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc60" + "6db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce" + "595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e9" + "00809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", + }, + { + 5120, + "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833ac" + "c61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a" + "0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9" + "321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", + "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b" + "4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495" + "f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f" + "9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", + }, + { + 5121, + "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96" + "adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647" + "eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204" + "ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", + "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d" + "07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c810" + "50b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092" + "133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", + }, + { + 6144, + "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d" + "742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade1" + "56c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c" + "6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", + "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc3" + "5754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f79075" + "61f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486" + "b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", + }, + { + 6145, + "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18" + "a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb01501" + "5532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a" + "3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", + "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3" + "c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283" + "ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f" + "372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", + }, + { + 7168, + "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a57" + "07c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165" + "b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de" + "4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", + "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2" + "f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba34840098" + "9a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3e" + "aebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", + }, + { + 7169, + "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798" + "a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9" + "b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a735485228" + "40779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", + "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c" + "9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabc" + "b438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc8566" + "17c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", + }, + { + 8192, + "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635f" + "e51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a4777" + "8566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62" + "712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", + "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a48" + "34464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc" + "40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121" + "cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", + }, + { + 8193, + "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2" + "282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea6" + "0bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0" + "b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", + "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f0" + "3228648fd983aef045c2fa8290934b0866b615f585149587dda229903996532883" + "5a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e0" + "9df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", + }, + { + 16384, + "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d" + "764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb3" + "9a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e475" + "03f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", + "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9" + "e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960" + "ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725" + "581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", + }, + { + 31744, + "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c4786" + "0cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f" + "5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac97" + "8bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", + "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a" + "7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628" + "be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c" + "2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", + }, + { + 102400, + "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e0" + "1c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f" + "0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c" + "009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e", + "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9" + "dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2" + "aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b" + "354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4", + }, + { + 0, 0, 0 + } +}; + +#ifdef BLAKE3_DEBUG +#define dprintf printf +#else +#define dprintf(...) +#endif + +static char fmt_tohex(char c); +static size_t fmt_hexdump(char *dest, const char *src, size_t len); + +static char fmt_tohex(char c) { + return ((char)(c >= 10 ? c-10+'a' : c+'0')); +} + +static size_t fmt_hexdump(char *dest, const char *src, size_t len) { + register const unsigned char *s = (const unsigned char *) src; + size_t written = 0, i; + + if (!dest) + return ((len > ((size_t)-1)/2) ? (size_t)-1 : len*2); + for (i = 0; i < len; ++i) { + dest[written] = fmt_tohex(s[i]>>4); + dest[written+1] = fmt_tohex(s[i]&15); + written += 2; + } + + return (written); +} + +int +main(int argc, char *argv[]) +{ + boolean_t failed = B_FALSE; + uint8_t buffer[102400]; + uint64_t cpu_mhz = 0; + int id, i, j; + + if (argc == 2) + cpu_mhz = atoi(argv[1]); + + /* fill test message */ + for (i = 0, j = 0; i < sizeof (buffer); i++, j++) { + if (j == 251) + j = 0; + buffer[i] = (uint8_t)j; + } + + (void) printf("Running algorithm correctness tests:\n"); + for (id = 0; id < blake3_get_impl_count(); id++) { + blake3_set_impl_id(id); + const char *name = blake3_get_impl_name(); + dprintf("Result for BLAKE3-%s:\n", name); + for (i = 0; TestArray[i].hash; i++) { + blake3_test_t *cur = &TestArray[i]; + + BLAKE3_CTX ctx; + uint8_t digest[TEST_DIGEST_LEN]; + char result[TEST_DIGEST_LEN]; + + /* default hashing */ + Blake3_Init(&ctx); + Blake3_Update(&ctx, buffer, cur->input_len); + Blake3_FinalSeek(&ctx, 0, digest, TEST_DIGEST_LEN); + fmt_hexdump(result, (char *)digest, 131); + if (memcmp(result, cur->hash, 131) != 0) + failed = B_TRUE; + + dprintf("HASH-res: %s\n", result); + dprintf("HASH-ref: %s\n", cur->hash); + + /* salted hashing */ + Blake3_InitKeyed(&ctx, (const uint8_t *)salt); + Blake3_Update(&ctx, buffer, cur->input_len); + Blake3_FinalSeek(&ctx, 0, digest, TEST_DIGEST_LEN); + fmt_hexdump(result, (char *)digest, 131); + if (memcmp(result, cur->shash, 131) != 0) + failed = B_TRUE; + + dprintf("SHASH-res: %s\n", result); + dprintf("SHASH-ref: %s\n", cur->shash); + + printf("BLAKE3-%s Message (inlen=%d)\tResult: %s\n", + name, cur->input_len, failed?"FAILED!":"OK"); + } + } + + if (failed) + return (1); + +#define BLAKE3_PERF_TEST(impl, diglen) \ + do { \ + BLAKE3_CTX ctx; \ + uint8_t digest[diglen / 8]; \ + uint8_t block[131072]; \ + uint64_t delta; \ + double cpb = 0; \ + int i; \ + struct timeval start, end; \ + memset(block, 0, sizeof (block)); \ + (void) gettimeofday(&start, NULL); \ + Blake3_Init(&ctx); \ + for (i = 0; i < 8192; i++) \ + Blake3_Update(&ctx, block, sizeof (block)); \ + Blake3_Final(&ctx, digest); \ + (void) gettimeofday(&end, NULL); \ + delta = (end.tv_sec * 1000000llu + end.tv_usec) - \ + (start.tv_sec * 1000000llu + start.tv_usec); \ + if (cpu_mhz != 0) { \ + cpb = (cpu_mhz * 1e6 * ((double)delta / \ + 1000000)) / (8192 * 128 * 1024); \ + } \ + (void) printf("BLAKE3-%s %llu us (%.02f CPB)\n", impl, \ + (u_longlong_t)delta, cpb); \ + } while (0) + + printf("Running performance tests (hashing 1024 MiB of data):\n"); + for (id = 0; id < blake3_get_impl_count(); id++) { + blake3_set_impl_id(id); + const char *name = blake3_get_impl_name(); + BLAKE3_PERF_TEST(name, 256); + } + + return (0); +} diff --git a/tests/zfs-tests/cmd/checksum/edonr_test.c b/tests/zfs-tests/cmd/checksum/edonr_test.c index c6365a414..3a0a48533 100644 --- a/tests/zfs-tests/cmd/checksum/edonr_test.c +++ b/tests/zfs-tests/cmd/checksum/edonr_test.c @@ -28,9 +28,6 @@ * gettimeofday due to -D_KERNEL (we can do this since we're actually * running in userspace, but we need -D_KERNEL for the remaining Edon-R code). */ -#ifdef _KERNEL -#undef _KERNEL -#endif #include <sys/edonr.h> #include <stdlib.h> diff --git a/tests/zfs-tests/cmd/checksum/sha2_test.c b/tests/zfs-tests/cmd/checksum/sha2_test.c index dc4173e10..bb3553110 100644 --- a/tests/zfs-tests/cmd/checksum/sha2_test.c +++ b/tests/zfs-tests/cmd/checksum/sha2_test.c @@ -28,9 +28,6 @@ * gettimeofday due to -D_KERNEL (we can do this since we're actually * running in userspace, but we need -D_KERNEL for the remaining SHA2 code). */ -#ifdef _KERNEL -#undef _KERNEL -#endif #include <stdarg.h> #include <stdlib.h> diff --git a/tests/zfs-tests/cmd/checksum/skein_test.c b/tests/zfs-tests/cmd/checksum/skein_test.c index 99b47b453..13611c860 100644 --- a/tests/zfs-tests/cmd/checksum/skein_test.c +++ b/tests/zfs-tests/cmd/checksum/skein_test.c @@ -28,9 +28,6 @@ * gettimeofday due to -D_KERNEL (we can do this since we're actually * running in userspace, but we need -D_KERNEL for the remaining Skein code). */ -#ifdef _KERNEL -#undef _KERNEL -#endif #include <sys/skein.h> #include <stdlib.h> diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 9dc2b4d0e..99430bc10 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -212,6 +212,7 @@ export ZFSTEST_FILES='badsend zed_fd_spill-zedlet suid_write_to_file cp_files + blake3_test edonr_test skein_test sha2_test diff --git a/tests/zfs-tests/include/properties.shlib b/tests/zfs-tests/include/properties.shlib index ba82f9620..14b3f4415 100644 --- a/tests/zfs-tests/include/properties.shlib +++ b/tests/zfs-tests/include/properties.shlib @@ -17,7 +17,7 @@ typeset -a compress_prop_vals=('off' 'lzjb' 'lz4' 'gzip' 'zle' 'zstd') typeset -a checksum_prop_vals=('on' 'off' 'fletcher2' 'fletcher4' 'sha256' - 'noparity' 'sha512' 'skein') + 'noparity' 'sha512' 'skein' 'blake3') if ! is_freebsd; then checksum_prop_vals+=('edonr') fi diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index a91a24d16..ffc087351 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -545,6 +545,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/checksum/cleanup.ksh \ functional/checksum/filetest_001_pos.ksh \ functional/checksum/filetest_002_pos.ksh \ + functional/checksum/run_blake3_test.ksh \ functional/checksum/run_edonr_test.ksh \ functional/checksum/run_sha2_test.ksh \ functional/checksum/run_skein_test.ksh \ diff --git a/tests/zfs-tests/tests/functional/checksum/default.cfg b/tests/zfs-tests/tests/functional/checksum/default.cfg index afb956093..a7e143e75 100644 --- a/tests/zfs-tests/tests/functional/checksum/default.cfg +++ b/tests/zfs-tests/tests/functional/checksum/default.cfg @@ -30,4 +30,4 @@ . $STF_SUITE/include/libtest.shlib -set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" +set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "blake3" "sha256" "sha512" "skein" "edonr" diff --git a/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh b/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh new file mode 100755 index 000000000..cf1ca7032 --- /dev/null +++ b/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015, 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# Run the tests for the BLAKE3 hash algorithm. +# + +log_assert "Run the tests for the BLAKE3 hash algorithm." + +freq=$(get_cpu_freq) +log_must blake3_test $freq + +log_pass "BLAKE3 tests passed." diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh index 27003b21b..cab7c185e 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh @@ -46,7 +46,7 @@ verify_runnable "both" set -A dataset "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL/$TESTVOL" -set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "noparity" +set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "blake3" "noparity" log_assert "Setting a valid checksum on a file system, volume," \ "it should be successful." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 4ea5725e0..7849ed226 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -99,5 +99,6 @@ if is_linux || is_freebsd; then "feature@zstd_compress" "feature@zilsaxattr" "feature@head_errlog" + "feature@blake3" ) fi |