aboutsummaryrefslogtreecommitdiffstats
path: root/module/icp
diff options
context:
space:
mode:
authorTino Reichardt <[email protected]>2022-06-09 00:55:57 +0200
committerGitHub <[email protected]>2022-06-08 15:55:57 -0700
commit985c33b132f6c23a69bd808e008ae0f46131a31e (patch)
tree4d973e14592e15a4908ae3de6d61cf3270a1b37c /module/icp
parentb9d98453f9387c413f91d1d9cdb0cba8e04dbd95 (diff)
Introduce BLAKE3 checksums as an OpenZFS feature
This commit adds BLAKE3 checksums to OpenZFS, it has similar performance to Edon-R, but without the caveats around the latter. Homepage of BLAKE3: https://github.com/BLAKE3-team/BLAKE3 Wikipedia: https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE3 Short description of Wikipedia: BLAKE3 is a cryptographic hash function based on Bao and BLAKE2, created by Jack O'Connor, Jean-Philippe Aumasson, Samuel Neves, and Zooko Wilcox-O'Hearn. It was announced on January 9, 2020, at Real World Crypto. BLAKE3 is a single algorithm with many desirable features (parallelism, XOF, KDF, PRF and MAC), in contrast to BLAKE and BLAKE2, which are algorithm families with multiple variants. BLAKE3 has a binary tree structure, so it supports a practically unlimited degree of parallelism (both SIMD and multithreading) given enough input. The official Rust and C implementations are dual-licensed as public domain (CC0) and the Apache License. Along with adding the BLAKE3 hash into the OpenZFS infrastructure a new benchmarking file called chksum_bench was introduced. When read it reports the speed of the available checksum functions. 
On Linux: cat /proc/spl/kstat/zfs/chksum_bench On FreeBSD: sysctl kstat.zfs.misc.chksum_bench This is an example output of an i3-1005G1 test system with Debian 11: implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 1196 1602 1761 1749 1762 1759 1751 skein-generic 546 591 608 615 619 612 616 sha256-generic 240 300 316 314 304 285 276 sha512-generic 353 441 467 476 472 467 426 blake3-generic 308 313 313 313 312 313 312 blake3-sse2 402 1289 1423 1446 1432 1458 1413 blake3-sse41 427 1470 1625 1704 1679 1607 1629 blake3-avx2 428 1920 3095 3343 3356 3318 3204 blake3-avx512 473 2687 4905 5836 5844 5643 5374 Output on Debian 5.10.0-10-amd64 system: (Ryzen 7 5800X) implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 1840 2458 2665 2719 2711 2723 2693 skein-generic 870 966 996 992 1003 1005 1009 sha256-generic 415 442 453 455 457 457 457 sha512-generic 608 690 711 718 719 720 721 blake3-generic 301 313 311 309 309 310 310 blake3-sse2 343 1865 2124 2188 2180 2181 2186 blake3-sse41 364 2091 2396 2509 2463 2482 2488 blake3-avx2 365 2590 4399 4971 4915 4802 4764 Output on Debian 5.10.0-9-powerpc64le system: (POWER 9) implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 1213 1703 1889 1918 1957 1902 1907 skein-generic 434 492 520 522 511 525 525 sha256-generic 167 183 187 188 188 187 188 sha512-generic 186 216 222 221 225 224 224 blake3-generic 153 152 154 153 151 153 153 blake3-sse2 391 1170 1366 1406 1428 1426 1414 blake3-sse41 352 1049 1212 1174 1262 1258 1259 Output on Debian 5.10.0-11-arm64 system: (Pi400) implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 487 603 629 639 643 641 641 skein-generic 271 299 303 308 309 309 307 sha256-generic 117 127 128 130 130 129 130 sha512-generic 145 165 170 172 173 174 175 blake3-generic 81 29 71 89 89 89 89 blake3-sse2 112 323 368 379 380 371 374 blake3-sse41 101 315 357 368 369 364 360 Structurally, the new code is mainly split into these parts: - 1x cross platform generic c variant: blake3_generic.c - 4x assembly for X86-64 
(SSE2, SSE4.1, AVX2, AVX512) - 2x assembly for ARMv8 (NEON converted from SSE2) - 2x assembly for PPC64-LE (POWER8 converted from SSE2) - one file for switching between the implementations Note the PPC64 assembly requires the VSX instruction set and the kfpu_begin() / kfpu_end() calls on PowerPC were updated accordingly. Reviewed-by: Felix Dörre <[email protected]> Reviewed-by: Ahelenia Ziemiańska <[email protected]> Reviewed-by: Brian Behlendorf <[email protected]> Signed-off-by: Tino Reichardt <[email protected]> Co-authored-by: Rich Ercolani <[email protected]> Closes #10058 Closes #12918
Diffstat (limited to 'module/icp')
-rw-r--r--module/icp/algs/blake3/blake3.c732
-rw-r--r--module/icp/algs/blake3/blake3_generic.c202
-rw-r--r--module/icp/algs/blake3/blake3_impl.c256
-rw-r--r--module/icp/algs/blake3/blake3_impl.h213
-rw-r--r--module/icp/algs/blake3/blake3_x86-64.c248
-rw-r--r--module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S2450
-rw-r--r--module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S2463
-rw-r--r--module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S2823
-rw-r--r--module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S3064
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_avx2.S1845
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_avx512.S2618
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_sse2.S2323
-rw-r--r--module/icp/asm-x86_64/blake3/blake3_sse41.S2058
13 files changed, 21295 insertions, 0 deletions
diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c
new file mode 100644
index 000000000..8c9c06eb9
--- /dev/null
+++ b/module/icp/algs/blake3/blake3.c
@@ -0,0 +1,732 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/blake3.h>
+
+#include "blake3_impl.h"
+
+/*
+ * We need 1056 byte stack for blake3_compress_subtree_wide()
+ * - we define this pragma to make gcc happy
+ */
+#if defined(__GNUC__)
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+/* internal used */
+/*
+ * A deferred compression: everything needed to run the compression function
+ * later, either to produce a chaining value (output_chaining_value) or root
+ * output bytes (output_root_bytes). Built by make_output().
+ */
+typedef struct {
+ uint32_t input_cv[8]; /* chaining value fed into the compression */
+ uint64_t counter; /* chunk counter; parent_output() stores 0 here */
+ uint8_t block[BLAKE3_BLOCK_LEN]; /* message block to be compressed */
+ uint8_t block_len; /* block length handed to the compression */
+ uint8_t flags; /* enum blake3_flags bits for the compression */
+} output_t;
+
+/* internal flags */
+enum blake3_flags {
+ CHUNK_START = 1 << 0, /* no block of the chunk compressed yet */
+ CHUNK_END = 1 << 1, /* added by chunk_state_output() */
+ PARENT = 1 << 2, /* added for parent node compressions */
+ ROOT = 1 << 3, /* added by output_root_bytes() */
+ KEYED_HASH = 1 << 4, /* set by Blake3_InitKeyed() */
+ DERIVE_KEY_CONTEXT = 1 << 5, /* not referenced in this file section */
+ DERIVE_KEY_MATERIAL = 1 << 6, /* not referenced in this file section */
+};
+
+/* internal start */
+/*
+ * Set up a chunk state for chunk number 0: the chaining value is seeded from
+ * key, the block buffer is emptied and the mode flags are remembered.
+ */
+static void chunk_state_init(blake3_chunk_state_t *ctx,
+	const uint32_t key[8], uint8_t flags)
+{
+	memcpy(ctx->cv, key, BLAKE3_KEY_LEN);
+	memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+	ctx->buf_len = 0;
+	ctx->blocks_compressed = 0;
+	ctx->chunk_counter = 0;
+	ctx->flags = flags;
+}
+
+/*
+ * Re-arm an existing chunk state for the chunk numbered chunk_counter.
+ * The chaining value is re-seeded from key and the buffer is emptied;
+ * ctx->flags is intentionally left as it was.
+ */
+static void chunk_state_reset(blake3_chunk_state_t *ctx,
+	const uint32_t key[8], uint64_t chunk_counter)
+{
+	memcpy(ctx->cv, key, BLAKE3_KEY_LEN);
+	memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+	ctx->buf_len = 0;
+	ctx->blocks_compressed = 0;
+	ctx->chunk_counter = chunk_counter;
+}
+
+/*
+ * Number of input bytes absorbed by the current chunk so far: the blocks
+ * already compressed plus whatever is waiting in the block buffer.
+ */
+static size_t chunk_state_len(const blake3_chunk_state_t *ctx)
+{
+	const size_t buffered = (size_t)ctx->buf_len;
+	const size_t compressed =
+	    BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed;
+
+	return (compressed + buffered);
+}
+
+/*
+ * Copy as much of input as fits into the free tail of the block buffer.
+ * Returns the number of bytes consumed, which may be 0.
+ */
+static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx,
+	const uint8_t *input, size_t input_len)
+{
+	const size_t used = (size_t)ctx->buf_len;
+	const size_t room = BLAKE3_BLOCK_LEN - used;
+	const size_t n = (room > input_len) ? input_len : room;
+
+	memcpy(&ctx->buf[used], input, n);
+	ctx->buf_len += (uint8_t)n;
+	return (n);
+}
+
+/*
+ * CHUNK_START while no block of this chunk has been compressed yet,
+ * otherwise no flag at all.
+ */
+static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx)
+{
+	return ((ctx->blocks_compressed == 0) ? CHUNK_START : 0);
+}
+
+/*
+ * Bundle the inputs of a pending compression into an output_t (by value).
+ * A full BLAKE3_BLOCK_LEN bytes are always copied from block; block_len
+ * is only recorded.
+ */
+static output_t make_output(const uint32_t input_cv[8],
+	const uint8_t *block, uint8_t block_len,
+	uint64_t counter, uint8_t flags)
+{
+	output_t pending;
+
+	memcpy(pending.input_cv, input_cv, sizeof (pending.input_cv));
+	memcpy(pending.block, block, sizeof (pending.block));
+	pending.flags = flags;
+	pending.counter = counter;
+	pending.block_len = block_len;
+	return (pending);
+}
+
+/*
+ * Chaining values within a given chunk (specifically the compress_in_place
+ * interface) are represented as words. This avoids unnecessary bytes<->words
+ * conversion overhead in the portable implementation. However, the hash_many
+ * interface handles both user input and parent node blocks, so it accepts
+ * bytes. For that reason, chaining values in the CV stack are represented as
+ * bytes.
+ */
+static void output_chaining_value(const blake3_impl_ops_t *ops,
+	const output_t *ctx, uint8_t cv[32])
+{
+	uint32_t words[8];
+
+	/* Work on a copy so the pending output itself stays untouched. */
+	memcpy(words, ctx->input_cv, sizeof (words));
+	ops->compress_in_place(words, ctx->block, ctx->block_len,
+	    ctx->counter, ctx->flags);
+
+	/* Hand the result back in byte form. */
+	store_cv_words(cv, words);
+}
+
+/*
+ * Produce out_len bytes of root output starting at byte offset seek of the
+ * output stream. Each compress_xof() call yields one 64 byte output block;
+ * only the first block may be entered at a non-zero offset.
+ */
+static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx,
+	uint64_t seek, uint8_t *out, size_t out_len)
+{
+	uint8_t xof_block[64];
+	uint64_t block_idx = seek / 64;
+	size_t skip = seek % 64;
+
+	for (; out_len > 0; block_idx += 1, skip = 0) {
+		ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len,
+		    block_idx, ctx->flags | ROOT, xof_block);
+
+		size_t avail = 64 - skip;
+		size_t n = (out_len > avail) ? avail : out_len;
+
+		memcpy(out, xof_block + skip, n);
+		out += n;
+		out_len -= n;
+	}
+}
+
+/*
+ * Absorb input_len bytes of input into the current chunk.
+ *
+ * Complete blocks are compressed into ctx->cv as they become available,
+ * but the last block (even a full one) is always left in ctx->buf so that
+ * chunk_state_output() can compress it later with CHUNK_END set.
+ */
+static void chunk_state_update(const blake3_impl_ops_t *ops,
+	blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len)
+{
+	/* First top up a partially filled block buffer. */
+	if (ctx->buf_len > 0) {
+		size_t take = chunk_state_fill_buf(ctx, input, input_len);
+		input += take;
+		input_len -= take;
+		/*
+		 * Input left over means the buffer is full and is not the
+		 * last block, so it can be compressed now.
+		 */
+		if (input_len > 0) {
+			ops->compress_in_place(ctx->cv, ctx->buf,
+			    BLAKE3_BLOCK_LEN, ctx->chunk_counter,
+			    ctx->flags|chunk_state_maybe_start_flag(ctx));
+			ctx->blocks_compressed += 1;
+			ctx->buf_len = 0;
+			memset(ctx->buf, 0, BLAKE3_BLOCK_LEN);
+		}
+	}
+
+	/*
+	 * Compress whole blocks straight from the input. The strict '>'
+	 * keeps the final block out of this loop.
+	 */
+	while (input_len > BLAKE3_BLOCK_LEN) {
+		ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN,
+		    ctx->chunk_counter,
+		    ctx->flags|chunk_state_maybe_start_flag(ctx));
+		ctx->blocks_compressed += 1;
+		input += BLAKE3_BLOCK_LEN;
+		input_len -= BLAKE3_BLOCK_LEN;
+	}
+
+	/*
+	 * Buffer the remainder (at most one block). The consumed count is
+	 * not needed any more, so no dead stores to input/input_len here.
+	 */
+	(void) chunk_state_fill_buf(ctx, input, input_len);
+}
+
+/*
+ * Describe the final compression of the current chunk: the buffered block
+ * with CHUNK_END set, plus CHUNK_START when it is also the first block.
+ */
+static output_t chunk_state_output(const blake3_chunk_state_t *ctx)
+{
+	uint8_t final_flags = ctx->flags;
+
+	final_flags |= chunk_state_maybe_start_flag(ctx);
+	final_flags |= CHUNK_END;
+	return (make_output(ctx->cv, ctx->buf, ctx->buf_len,
+	    ctx->chunk_counter, final_flags));
+}
+
+/*
+ * Describe the compression of a parent node: block holds the two child
+ * chaining values, key is used as input CV, the counter is always 0.
+ */
+static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN],
+	const uint32_t key[8], uint8_t flags)
+{
+	const uint8_t parent_flags = flags | PARENT;
+
+	return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, parent_flags));
+}
+
+/*
+ * Given some input larger than one chunk, return the number of bytes that
+ * should go in the left subtree. This is the largest power-of-2 number of
+ * chunks that leaves at least 1 byte for the right subtree.
+ */
+static size_t left_len(size_t content_len)
+{
+	/*
+	 * One byte is held back for the right subtree before counting whole
+	 * chunks. content_len should always be greater than
+	 * BLAKE3_CHUNK_LEN.
+	 */
+	const size_t whole_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN;
+
+	return (round_down_to_power_of_2(whole_chunks) * BLAKE3_CHUNK_LEN);
+}
+
+/*
+ * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time
+ * on a single thread. Write out the chunk chaining values and return the
+ * number of chunks hashed. These chunks are never the root and never empty;
+ * those cases use a different codepath.
+ */
+static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops,
+	const uint8_t *input, size_t input_len, const uint32_t key[8],
+	uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+	const uint8_t *chunk_ptrs[MAX_SIMD_DEGREE];
+	size_t nchunks = 0;
+	size_t pos;
+
+	/* Collect a pointer to every complete chunk of the input. */
+	for (pos = 0; input_len - pos >= BLAKE3_CHUNK_LEN;
+	    pos += BLAKE3_CHUNK_LEN) {
+		chunk_ptrs[nchunks] = input + pos;
+		nchunks += 1;
+	}
+
+	/* Hash all complete chunks in one go. */
+	ops->hash_many(chunk_ptrs, nchunks,
+	    BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE,
+	    flags, CHUNK_START, CHUNK_END, out);
+
+	/* Nothing left over: only the complete chunks were hashed. */
+	if (input_len <= pos) {
+		return (nchunks);
+	}
+
+	/*
+	 * A partial chunk remains. It is hashed through a temporary chunk
+	 * state and its CV is appended after the ones written above. (The
+	 * empty chunk, meaning the empty message, is a different codepath.)
+	 */
+	blake3_chunk_state_t tail;
+	chunk_state_init(&tail, key, flags);
+	tail.chunk_counter = chunk_counter + (uint64_t)nchunks;
+	chunk_state_update(ops, &tail, input + pos, input_len - pos);
+
+	output_t tail_output = chunk_state_output(&tail);
+	output_chaining_value(ops, &tail_output,
+	    out + nchunks * BLAKE3_OUT_LEN);
+	return (nchunks + 1);
+}
+
+/*
+ * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time
+ * on a single thread. Write out the parent chaining values and return the
+ * number of parents hashed. (If there's an odd input chaining value left over,
+ * return it as an additional output.) These parents are never the root and
+ * never empty; those cases use a different codepath.
+ */
+static size_t compress_parents_parallel(const blake3_impl_ops_t *ops,
+	const uint8_t *child_chaining_values, size_t num_chaining_values,
+	const uint32_t key[8], uint8_t flags, uint8_t *out)
+{
+	const uint8_t *parent_ptrs[MAX_SIMD_DEGREE_OR_2];
+	/* Every two consecutive child CVs form one parent block. */
+	const size_t nparents = num_chaining_values / 2;
+	size_t i;
+
+	for (i = 0; i < nparents; i++) {
+		parent_ptrs[i] =
+		    child_chaining_values + 2 * i * BLAKE3_OUT_LEN;
+	}
+
+	ops->hash_many(parent_ptrs, nparents, 1, key, 0, B_FALSE,
+	    flags | PARENT, 0, 0, out);
+
+	/* Even number of children: everything has been paired up. */
+	if (num_chaining_values <= 2 * nparents) {
+		return (nparents);
+	}
+
+	/* The odd child left over is passed through as an extra output. */
+	memcpy(out + nparents * BLAKE3_OUT_LEN,
+	    child_chaining_values + 2 * nparents * BLAKE3_OUT_LEN,
+	    BLAKE3_OUT_LEN);
+	return (nparents + 1);
+}
+
+/*
+ * The wide helper function returns (writes out) an array of chaining values
+ * and returns the length of that array. The number of chaining values returned
+ * is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+ * if the input is shorter than that many chunks. The reason for maintaining a
+ * wide array of chaining values going back up the tree, is to allow the
+ * implementation to hash as many parents in parallel as possible.
+ *
+ * As a special case when the SIMD degree is 1, this function will still return
+ * at least 2 outputs. This guarantees that this function doesn't perform the
+ * root compression. (If it did, it would use the wrong flags, and also we
+ * wouldn't be able to implement extendable output.) Note that this function is
+ * not used when the whole input is only 1 chunk long; that's a different
+ * codepath.
+ *
+ * Why not just have the caller split the input on the first update(), instead
+ * of implementing this special rule? Because we don't want to limit SIMD or
+ * multi-threading parallelism for that update().
+ */
+static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops,
+ const uint8_t *input, size_t input_len, const uint32_t key[8],
+ uint64_t chunk_counter, uint8_t flags, uint8_t *out)
+{
+ /*
+ * Note that the single chunk case does *not* bump the SIMD degree up
+ * to 2 when it is 1. If this implementation adds multi-threading in
+ * the future, this gives us the option of multi-threading even the
+ * 2-chunk case, which can help performance on smaller platforms.
+ */
+ /* Base case of the recursion: the input fits into one hash_many call. */
+ if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) {
+ return (compress_chunks_parallel(ops, input, input_len, key,
+ chunk_counter, flags, out));
+ }
+
+
+ /*
+ * With more than simd_degree chunks, we need to recurse. Start by
+ * dividing the input into left and right subtrees. (Note that this is
+ * only optimal as long as the SIMD degree is a power of 2. If we ever
+ * get a SIMD degree of 3 or something, we'll need a more complicated
+ * strategy.)
+ */
+ size_t left_input_len = left_len(input_len);
+ size_t right_input_len = input_len - left_input_len;
+ const uint8_t *right_input = &input[left_input_len];
+ /* The right subtree continues the chunk numbering after the left one. */
+ uint64_t right_chunk_counter = chunk_counter +
+ (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN);
+
+ /*
+ * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2
+ * to account for the special case of returning 2 outputs when the
+ * SIMD degree is 1.
+ */
+ uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+ size_t degree = ops->degree;
+ if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) {
+
+ /*
+ * The special case: We always use a degree of at least two,
+ * to make sure there are two outputs. Except, as noted above,
+ * at the chunk level, where we allow degree=1. (Note that the
+ * 1-chunk-input case is a different codepath.)
+ */
+ degree = 2;
+ }
+ /* The right subtree's CVs are written after 'degree' CV slots. */
+ uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN];
+
+ /*
+ * Recurse! If this implementation adds multi-threading support in the
+ * future, this is where it will go.
+ */
+ size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len,
+ key, chunk_counter, flags, cv_array);
+ size_t right_n = blake3_compress_subtree_wide(ops, right_input,
+ right_input_len, key, right_chunk_counter, flags, right_cvs);
+
+ /*
+ * The special case again. If simd_degree=1, then we'll have left_n=1
+ * and right_n=1. Rather than compressing them into a single output,
+ * return them directly, to make sure we always have at least two
+ * outputs.
+ */
+ if (left_n == 1) {
+ memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
+ return (2);
+ }
+
+ /* Otherwise, do one layer of parent node compression. */
+ size_t num_chaining_values = left_n + right_n;
+ return compress_parents_parallel(ops, cv_array,
+ num_chaining_values, key, flags, out);
+}
+
+/*
+ * Hash a subtree with compress_subtree_wide(), and then condense the resulting
+ * list of chaining values down to a single parent node. Don't compress that
+ * last parent node, however. Instead, return its message bytes (the
+ * concatenated chaining values of its children). This is necessary when the
+ * first call to update() supplies a complete subtree, because the topmost
+ * parent node of that subtree could end up being the root. It's also necessary
+ * for extended output in the general case.
+ *
+ * As with compress_subtree_wide(), this function is not used on inputs of 1
+ * chunk or less. That's a different codepath.
+ */
+static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops,
+	const uint8_t *input, size_t input_len, const uint32_t key[8],
+	uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN])
+{
+	uint8_t cvs[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
+	uint8_t merged[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
+	size_t n;
+
+	n = blake3_compress_subtree_wide(ops, input, input_len, key,
+	    chunk_counter, flags, cvs);
+
+	/*
+	 * The wide helper may hand back more than 2 chaining values when
+	 * MAX_SIMD_DEGREE is greater than 2 and the input is long enough.
+	 * Keep forming parent layers until exactly the top two remain.
+	 */
+	while (n > 2) {
+		n = compress_parents_parallel(ops, cvs, n, key, flags,
+		    merged);
+		memcpy(cvs, merged, n * BLAKE3_OUT_LEN);
+	}
+
+	/* The two remaining CVs are the message bytes of the top parent. */
+	memcpy(out, cvs, 2 * BLAKE3_OUT_LEN);
+}
+
+/*
+ * Common initialisation for all hashing modes: remember the key words,
+ * start a fresh chunk state, empty the CV stack and select the
+ * implementation returned by blake3_impl_get_ops().
+ */
+static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8],
+	uint8_t flags)
+{
+	ctx->cv_stack_len = 0;
+	memcpy(ctx->key, key, BLAKE3_KEY_LEN);
+	chunk_state_init(&ctx->chunk, key, flags);
+	ctx->ops = blake3_impl_get_ops();
+}
+
+/*
+ * As described in hasher_push_cv() below, we do "lazy merging", delaying
+ * merges until right before the next CV is about to be added. This is
+ * different from the reference implementation. Another difference is that we
+ * aren't always merging 1 chunk at a time. Instead, each CV might represent
+ * any power-of-two number of chunks, as long as the smaller-above-larger
+ * stack order is maintained. Instead of the "count the trailing 0-bits"
+ * algorithm described in the spec, we use a "count the total number of
+ * 1-bits" variant that doesn't require us to retain the subtree size of the
+ * CV on top of the stack. The principle is the same: each CV that should
+ * remain in the stack is represented by a 1-bit in the total number of chunks
+ * (or bytes) so far.
+ */
+static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len)
+{
+	/* One CV stays on the stack for every 1-bit of total_len. */
+	const size_t keep = (size_t)popcnt(total_len);
+
+	while (ctx->cv_stack_len > keep) {
+		/* Merge the two topmost CVs in place into their parent CV. */
+		uint8_t *top_pair = ctx->cv_stack +
+		    (ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN;
+		output_t parent =
+		    parent_output(top_pair, ctx->key, ctx->chunk.flags);
+
+		output_chaining_value(ctx->ops, &parent, top_pair);
+		ctx->cv_stack_len -= 1;
+	}
+}
+
+/*
+ * In reference_impl.rs, we merge the new CV with existing CVs from the stack
+ * before pushing it. We can do that because we know more input is coming, so
+ * we know none of the merges are root.
+ *
+ * This setting is different. We want to feed as much input as possible to
+ * compress_subtree_wide(), without setting aside anything for the chunk_state.
+ * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once
+ * as a single subtree, if at all possible.
+ *
+ * This leads to two problems:
+ * 1) This 64 KiB input might be the only call that ever gets made to update.
+ * In this case, the root node of the 64 KiB subtree would be the root node
+ * of the whole tree, and it would need to be ROOT finalized. We can't
+ * compress it until we know.
+ * 2) This 64 KiB input might complete a larger tree, whose root node is
+ * similarly going to be the root of the whole tree. For example, maybe
+ * we have 192 KiB (that is, 128 + 64) hashed so far. We can't compress the
+ * node at the root of the 256 KiB subtree until we know how to finalize it.
+ *
+ * The second problem is solved with "lazy merging". That is, when we're about
+ * to add a CV to the stack, we don't merge it with anything first, as the
+ * reference impl does. Instead we do merges using the *previous* CV that was
+ * added, which is sitting on top of the stack, and we put the new CV
+ * (unmerged) on top of the stack afterwards. This guarantees that we never
+ * merge the root node until finalize().
+ *
+ * Solving the first problem requires an additional tool,
+ * compress_subtree_to_parent_node(). That function always returns the top
+ * *two* chaining values of the subtree it's compressing. We then do lazy
+ * merging with each of them separately, so that the second CV will always
+ * remain unmerged. (That also helps us support extendable output when we're
+ * hashing an input all-at-once.)
+ */
+static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN],
+	uint64_t chunk_counter)
+{
+	/* Lazy merging: settle the CVs already stacked before pushing. */
+	hasher_merge_cv_stack(ctx, chunk_counter);
+
+	uint8_t *slot = ctx->cv_stack + ctx->cv_stack_len * BLAKE3_OUT_LEN;
+	memcpy(slot, new_cv, BLAKE3_OUT_LEN);
+	ctx->cv_stack_len += 1;
+}
+
+/*
+ * Initialise ctx for plain (unkeyed) hashing: BLAKE3_IV is used as the key
+ * words and no mode flag is set.
+ */
+void
+Blake3_Init(BLAKE3_CTX *ctx)
+{
+ hasher_init_base(ctx, BLAKE3_IV, 0);
+}
+
+/*
+ * Initialise ctx for keyed hashing. The BLAKE3_KEY_LEN key bytes are
+ * converted to words with load_key_words() and the KEYED_HASH flag is set.
+ */
+void
+Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN])
+{
+ uint32_t key_words[8];
+ load_key_words(key, key_words);
+ hasher_init_base(ctx, key_words, KEYED_HASH);
+}
+
+/*
+ * Absorb input_len bytes of input into the hasher state. This is the worker
+ * behind Blake3_Update(), which feeds it the input in bounded pieces.
+ *
+ * Three phases: finish a partially filled chunk, hash as many whole
+ * power-of-2 subtrees as possible, then buffer what is left in ctx->chunk.
+ */
+static void
+Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len)
+{
+ /*
+ * Explicitly checking for zero avoids causing UB by passing a null
+ * pointer to memcpy. This comes up in practice with things like:
+ * std::vector<uint8_t> v;
+ * blake3_hasher_update(&hasher, v.data(), v.size());
+ */
+ if (input_len == 0) {
+ return;
+ }
+
+ const uint8_t *input_bytes = (const uint8_t *)input;
+
+ /*
+ * If we have some partial chunk bytes in the internal chunk_state, we
+ * need to finish that chunk first.
+ */
+ if (chunk_state_len(&ctx->chunk) > 0) {
+ size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk);
+ if (take > input_len) {
+ take = input_len;
+ }
+ chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take);
+ input_bytes += take;
+ input_len -= take;
+ /*
+ * If we've filled the current chunk and there's more coming,
+ * finalize this chunk and proceed. In this case we know it's
+ * not the root.
+ */
+ if (input_len > 0) {
+ output_t output = chunk_state_output(&ctx->chunk);
+ uint8_t chunk_cv[32];
+ output_chaining_value(ctx->ops, &output, chunk_cv);
+ hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter);
+ chunk_state_reset(&ctx->chunk, ctx->key,
+ ctx->chunk.chunk_counter + 1);
+ } else {
+ return;
+ }
+ }
+
+ /*
+ * Now the chunk_state is clear, and we have more input. If there's
+ * more than a single chunk (so, definitely not the root chunk), hash
+ * the largest whole subtree we can, with the full benefits of SIMD
+ * (and maybe in the future, multi-threading) parallelism. Two
+ * restrictions:
+ * - The subtree has to be a power-of-2 number of chunks. Only
+ * subtrees along the right edge can be incomplete, and we don't know
+ * where the right edge is going to be until we get to finalize().
+ * - The subtree must evenly divide the total number of chunks up
+ * until this point (if total is not 0). If the current incomplete
+ * subtree is only waiting for 1 more chunk, we can't hash a subtree
+ * of 4 chunks. We have to complete the current subtree first.
+ * Because we might need to break up the input to form powers of 2, or
+ * to evenly divide what we already have, this part runs in a loop.
+ */
+ while (input_len > BLAKE3_CHUNK_LEN) {
+ size_t subtree_len = round_down_to_power_of_2(input_len);
+ /* Bytes covered by all chunks counted so far. */
+ uint64_t count_so_far =
+ ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN;
+ /*
+ * Shrink the subtree_len until it evenly divides the count so
+ * far. We know that subtree_len itself is a power of 2, so we
+ * can use a bitmasking trick instead of an actual remainder
+ * operation. (Note that if the caller consistently passes
+ * power-of-2 inputs of the same size, as is hopefully
+ * typical, this loop condition will always fail, and
+ * subtree_len will always be the full length of the input.)
+ *
+ * An aside: We don't have to shrink subtree_len quite this
+ * much. For example, if count_so_far is 1, we could pass 2
+ * chunks to compress_subtree_to_parent_node. Since we'll get
+ * 2 CVs back, we'll still get the right answer in the end,
+ * and we might get to use 2-way SIMD parallelism. The problem
+ * with this optimization, is that it gets us stuck always
+ * hashing 2 chunks. The total number of chunks will remain
+ * odd, and we'll never graduate to higher degrees of
+ * parallelism. See
+ * https://github.com/BLAKE3-team/BLAKE3/issues/69.
+ */
+ while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) {
+ subtree_len /= 2;
+ }
+ /*
+ * The shrunken subtree_len might now be 1 chunk long. If so,
+ * hash that one chunk by itself. Otherwise, compress the
+ * subtree into a pair of CVs.
+ */
+ uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN;
+ if (subtree_len <= BLAKE3_CHUNK_LEN) {
+ blake3_chunk_state_t chunk_state;
+ chunk_state_init(&chunk_state, ctx->key,
+ ctx->chunk.flags);
+ chunk_state.chunk_counter = ctx->chunk.chunk_counter;
+ chunk_state_update(ctx->ops, &chunk_state, input_bytes,
+ subtree_len);
+ output_t output = chunk_state_output(&chunk_state);
+ uint8_t cv[BLAKE3_OUT_LEN];
+ output_chaining_value(ctx->ops, &output, cv);
+ hasher_push_cv(ctx, cv, chunk_state.chunk_counter);
+ } else {
+ /*
+ * This is the high-performance happy path, though
+ * getting here depends on the caller giving us a long
+ * enough input.
+ */
+ uint8_t cv_pair[2 * BLAKE3_OUT_LEN];
+ compress_subtree_to_parent_node(ctx->ops, input_bytes,
+ subtree_len, ctx->key, ctx-> chunk.chunk_counter,
+ ctx->chunk.flags, cv_pair);
+ /* Push both halves; the second starts mid-subtree. */
+ hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter);
+ hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN],
+ ctx->chunk.chunk_counter + (subtree_chunks / 2));
+ }
+ ctx->chunk.chunk_counter += subtree_chunks;
+ input_bytes += subtree_len;
+ input_len -= subtree_len;
+ }
+
+ /*
+ * If there's any remaining input less than a full chunk, add it to
+ * the chunk state. In that case, also do a final merge loop to make
+ * sure the subtree stack doesn't contain any unmerged pairs. The
+ * remaining input means we know these merges are non-root. This merge
+ * loop isn't strictly necessary here, because hasher_push_cv
+ * already does its own merge loop, but it simplifies
+ * Blake3_FinalSeek below.
+ */
+ if (input_len > 0) {
+ chunk_state_update(ctx->ops, &ctx->chunk, input_bytes,
+ input_len);
+ hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter);
+ }
+}
+
+/*
+ * Public update entry point. The input is handed to Blake3_Update2() in
+ * pieces of at most 64 KiB; a length of zero does nothing.
+ */
+void
+Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo)
+{
+	const size_t max_step = 1024 * 64;
+	const uint8_t *cursor = input;
+
+	/* max feed buffer to leave the stack size small */
+	while (todo != 0) {
+		const size_t step = (todo < max_step) ? todo : max_step;
+
+		Blake3_Update2(ctx, cursor, step);
+		cursor += step;
+		todo -= step;
+	}
+}
+
+/*
+ * Write the default-length digest (the first BLAKE3_OUT_LEN output bytes,
+ * seek offset 0) to out. ctx is taken as const.
+ */
+void
+Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out)
+{
+ Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN);
+}
+
+/*
+ * Write out_len bytes of hash output, starting at byte offset seek of the
+ * output stream, to out. ctx is taken as const; all merging is done on
+ * local copies.
+ */
+void
+Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out,
+ size_t out_len)
+{
+ /*
+ * Explicitly checking for zero avoids causing UB by passing a null
+ * pointer to memcpy. This comes up in practice with things like:
+ * std::vector<uint8_t> v;
+ * blake3_hasher_finalize(&hasher, v.data(), v.size());
+ */
+ if (out_len == 0) {
+ return;
+ }
+ /* If the subtree stack is empty, then the current chunk is the root. */
+ if (ctx->cv_stack_len == 0) {
+ output_t output = chunk_state_output(&ctx->chunk);
+ output_root_bytes(ctx->ops, &output, seek, out, out_len);
+ return;
+ }
+ /*
+ * If there are any bytes in the chunk state, finalize that chunk and
+ * do a roll-up merge between that chunk hash and every subtree in the
+ * stack. In this case, the extra merge loop at the end of
+ * Blake3_Update2 guarantees that none of the subtrees in the
+ * stack need to be merged with each other first. Otherwise, if there
+ * are no bytes in the chunk state, then the top of the stack is a
+ * chunk hash, and we start the merge from that.
+ */
+ output_t output;
+ size_t cvs_remaining;
+ if (chunk_state_len(&ctx->chunk) > 0) {
+ cvs_remaining = ctx->cv_stack_len;
+ output = chunk_state_output(&ctx->chunk);
+ } else {
+ /* There are always at least 2 CVs in the stack in this case. */
+ cvs_remaining = ctx->cv_stack_len - 2;
+ output = parent_output(&ctx->cv_stack[cvs_remaining * 32],
+ ctx->key, ctx->chunk.flags);
+ }
+ /*
+ * Walk down the stack: each stacked CV becomes the left half of a
+ * parent block, the CV of the pending output the right half.
+ */
+ while (cvs_remaining > 0) {
+ cvs_remaining -= 1;
+ uint8_t parent_block[BLAKE3_BLOCK_LEN];
+ memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32);
+ output_chaining_value(ctx->ops, &output, &parent_block[32]);
+ output = parent_output(parent_block, ctx->key,
+ ctx->chunk.flags);
+ }
+ /* Only this last, still uncompressed node is finalized as root. */
+ output_root_bytes(ctx->ops, &output, seek, out, out_len);
+}
diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c
new file mode 100644
index 000000000..6ff9a845c
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_generic.c
@@ -0,0 +1,202 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include "blake3_impl.h"
+
+#define	rotr32(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
+
+/*
+ * Mixing step: folds the message words x and y into the four state words
+ * selected by the indices a, b, c and d, using 32-bit additions, XORs and
+ * right rotations by 16, 12, 8 and 7 bits. The state is updated in place.
+ */
+static inline void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
+    uint32_t x, uint32_t y)
+{
+	uint32_t *wa = &state[a];
+	uint32_t *wb = &state[b];
+	uint32_t *wc = &state[c];
+	uint32_t *wd = &state[d];
+
+	/* first half: message word x, rotations 16 and 12 */
+	*wa += *wb + x;
+	*wd = rotr32(*wd ^ *wa, 16);
+	*wc += *wd;
+	*wb = rotr32(*wb ^ *wc, 12);
+
+	/* second half: message word y, rotations 8 and 7 */
+	*wa += *wb + y;
+	*wd = rotr32(*wd ^ *wa, 8);
+	*wc += *wd;
+	*wb = rotr32(*wb ^ *wc, 7);
+}
+
+/*
+ * Run one round over the 4x4 word state: four g() calls down the columns,
+ * then four along the diagonals. Message words are picked through the
+ * schedule row that belongs to `round`.
+ */
+static inline void round_fn(uint32_t state[16], const uint32_t *msg,
+    size_t round)
+{
+	const uint8_t *sched = BLAKE3_MSG_SCHEDULE[round];
+	size_t i;
+
+	/* Columns: (i, 4 + i, 8 + i, 12 + i), schedule entries 0..7. */
+	for (i = 0; i < 4; i++) {
+		g(state, i, 4 + i, 8 + i, 12 + i,
+		    msg[sched[2 * i]], msg[sched[2 * i + 1]]);
+	}
+
+	/*
+	 * Diagonals: (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14),
+	 * schedule entries 8..15.
+	 */
+	for (i = 0; i < 4; i++) {
+		g(state, i, 4 + ((i + 1) & 3), 8 + ((i + 2) & 3),
+		    12 + ((i + 3) & 3),
+		    msg[sched[8 + 2 * i]], msg[sched[9 + 2 * i]]);
+	}
+}
+
+/*
+ * Build the 16-word working state from the chaining value, the first four
+ * IV words, the 64-bit counter, the block length and the flags, then run
+ * all seven rounds over the message block. The caller performs the final
+ * feed-forward XOR.
+ */
+static inline void compress_pre(uint32_t state[16], const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags)
+{
+	uint32_t block_words[16];
+	size_t i;
+
+	/* decode the block into sixteen words via load32() */
+	for (i = 0; i < 16; i++)
+		block_words[i] = load32(block + 4 * i);
+
+	/* words 0..7: chaining value, words 8..11: IV[0..3] */
+	for (i = 0; i < 8; i++)
+		state[i] = cv[i];
+	for (i = 0; i < 4; i++)
+		state[8 + i] = BLAKE3_IV[i];
+
+	/* words 12..15: counter (low, high), block length, flags */
+	state[12] = counter_low(counter);
+	state[13] = counter_high(counter);
+	state[14] = (uint32_t)block_len;
+	state[15] = (uint32_t)flags;
+
+	for (i = 0; i < 7; i++)
+		round_fn(state, &block_words[0], i);
+}
+
+/*
+ * Compress one block and overwrite cv[] with the new chaining value, which
+ * is the XOR of the lower and upper halves of the final state.
+ */
+static inline void blake3_compress_in_place_generic(uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags)
+{
+	uint32_t state[16];
+	size_t i;
+
+	compress_pre(state, cv, block, block_len, counter, flags);
+	for (i = 0; i < 8; i++)
+		cv[i] = state[i] ^ state[i + 8];
+}
+
+/*
+ * Hash `blocks` consecutive full blocks of one input, starting from `key`.
+ * flags_start is OR-ed into the first block only, flags_end into the last
+ * block only; the resulting chaining value is serialized into `out`.
+ */
+static inline void hash_one_generic(const uint8_t *input, size_t blocks,
+    const uint32_t key[8], uint64_t counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
+{
+	uint32_t cv[8];
+	uint8_t block_flags = flags | flags_start;
+
+	memcpy(cv, key, BLAKE3_KEY_LEN);
+	for (; blocks > 0; blocks--, input += BLAKE3_BLOCK_LEN) {
+		if (blocks == 1)
+			block_flags |= flags_end;
+		blake3_compress_in_place_generic(cv, input, BLAKE3_BLOCK_LEN,
+		    counter, block_flags);
+		/* the start flag applies to the first block only */
+		block_flags = flags;
+	}
+	store_cv_words(out, cv);
+}
+
+/*
+ * Compress one block and emit all 64 output bytes: the first 32 are the
+ * XOR of the two state halves, the last 32 are the upper state half XOR-ed
+ * with the input chaining value. cv[] itself is not modified.
+ */
+static inline void blake3_compress_xof_generic(const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags, uint8_t out[64])
+{
+	uint32_t state[16];
+	size_t i;
+
+	compress_pre(state, cv, block, block_len, counter, flags);
+
+	/* bytes 0..31 */
+	for (i = 0; i < 8; i++)
+		store32(&out[i * 4], state[i] ^ state[i + 8]);
+
+	/* bytes 32..63 */
+	for (i = 0; i < 8; i++)
+		store32(&out[(i + 8) * 4], state[i + 8] ^ cv[i]);
+}
+
+/*
+ * Hash num_inputs inputs one after another. Each input contributes
+ * BLAKE3_OUT_LEN bytes to `out`; the counter advances per input only when
+ * increment_counter is set.
+ */
+static inline void blake3_hash_many_generic(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter,
+    boolean_t increment_counter, uint8_t flags, uint8_t flags_start,
+    uint8_t flags_end, uint8_t *out)
+{
+	for (; num_inputs > 0; num_inputs--, inputs++, out += BLAKE3_OUT_LEN) {
+		hash_one_generic(*inputs, blocks, key, counter, flags,
+		    flags_start, flags_end, out);
+		if (increment_counter)
+			counter++;
+	}
+}
+
+/*
+ * Availability probe for the portable C implementation: it is reported as
+ * usable unconditionally.
+ */
+static inline boolean_t blake3_is_generic_supported(void)
+{
+ return (B_TRUE);
+}
+
+/*
+ * Ops table exposing the portable C routines defined in this file under
+ * the name "generic".
+ * NOTE(review): .degree presumably is the number of inputs the caller
+ * should batch per hash_many call - confirm against the users of `degree`.
+ */
+const blake3_impl_ops_t blake3_generic_impl = {
+ .compress_in_place = blake3_compress_in_place_generic,
+ .compress_xof = blake3_compress_xof_generic,
+ .hash_many = blake3_hash_many_generic,
+ .is_supported = blake3_is_generic_supported,
+ .degree = 4,
+ .name = "generic"
+};
diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c
new file mode 100644
index 000000000..c3268ec13
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_impl.c
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_checksum.h>
+
+#include "blake3_impl.h"
+
+/*
+ * All implementations compiled into this build, generic first. The
+ * preprocessor guards mirror the extern declarations in blake3_impl.h.
+ * Implementation ids used by this file are NOT indexes into this array:
+ * they count only the entries whose is_supported() callback returns true,
+ * in array order.
+ */
+static const blake3_impl_ops_t *const blake3_impls[] = {
+ &blake3_generic_impl,
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE2)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ &blake3_sse2_impl,
+#endif
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE4_1)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ &blake3_sse41_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+ &blake3_avx2_impl,
+#endif
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+ &blake3_avx512_impl,
+#endif
+};
+
+/* this pointer holds current ops for implementation */
+static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl;
+
+/* special implementation selections */
+#define IMPL_FASTEST (UINT32_MAX)
+#define IMPL_CYCLE (UINT32_MAX-1)
+#define IMPL_USER (UINT32_MAX-2)
+#define IMPL_PARAM (UINT32_MAX-3)
+
+#define IMPL_READ(i) (*(volatile uint32_t *) &(i))
+static uint32_t icp_blake3_impl = IMPL_FASTEST;
+
+#define BLAKE3_IMPL_NAME_MAX 16
+
+/* id of fastest implementation */
+static uint32_t blake3_fastest_id = 0;
+
+/* currently used id */
+static uint32_t blake3_current_id = 0;
+
+/* id of module parameter (-1 == unused) */
+static int blake3_param_id = -1;
+
+/* return number of supported implementations */
+int
+blake3_get_impl_count(void)
+{
+	/* cached result; zero means "not counted yet" */
+	static int impls = 0;
+	int n;
+
+	if (impls == 0) {
+		for (n = 0; n < ARRAY_SIZE(blake3_impls); n++) {
+			if (blake3_impls[n]->is_supported())
+				impls++;
+		}
+	}
+
+	return (impls);
+}
+
+/* return id of selected implementation */
+int
+blake3_get_impl_id(void)
+{
+	/* ids count supported implementations only, starting at 0 */
+	const uint32_t cur = blake3_current_id;
+
+	return (cur);
+}
+
+/* return name of selected implementation */
+const char *
+blake3_get_impl_name(void)
+{
+	const blake3_impl_ops_t *ops = blake3_selected_impl;
+
+	/* the string is owned by the ops table */
+	return (ops->name);
+}
+
+/* setup id as fastest implementation */
+void
+blake3_set_impl_fastest(uint32_t id)
+{
+ /*
+  * Only records the id; nothing is switched here. The value is
+  * consumed by blake3_set_impl_id() when IMPL_FASTEST is requested.
+  * The id is not range-checked at this point.
+  */
+ blake3_fastest_id = id;
+}
+
+/* set implementation by id */
+void
+blake3_set_impl_id(uint32_t id)
+{
+	uint32_t cid = 0;
+	size_t i;
+
+	/* select fastest */
+	if (id == IMPL_FASTEST)
+		id = blake3_fastest_id;
+
+	/*
+	 * select next or first
+	 *
+	 * The successor is computed from a copy instead of incrementing
+	 * blake3_current_id in place, so the shared variable never holds a
+	 * transient id that is past the last supported implementation.
+	 * The count is at least 1 because the generic implementation is
+	 * always listed and always supported.
+	 */
+	if (id == IMPL_CYCLE) {
+		id = (blake3_current_id + 1) %
+		    (uint32_t)blake3_get_impl_count();
+	}
+
+	/*
+	 * 0..N for the real impl: ids number the supported entries only.
+	 * An id that matches no supported entry leaves the current
+	 * selection untouched.
+	 */
+	for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+		if (!blake3_impls[i]->is_supported())
+			continue;
+		if (cid == id) {
+			blake3_current_id = cid;
+			blake3_selected_impl = blake3_impls[i];
+			return;
+		}
+		cid++;
+	}
+}
+
+/* set implementation by name */
+int
+blake3_set_impl_name(const char *name)
+{
+	int i, cid;
+	const int want_fastest = (strcmp(name, "fastest") == 0);
+
+	/* the two pseudo names switch the selector mode immediately */
+	if (want_fastest || strcmp(name, "cycle") == 0) {
+		const uint32_t mode = want_fastest ? IMPL_FASTEST : IMPL_CYCLE;
+
+		atomic_swap_32(&icp_blake3_impl, mode);
+		blake3_set_impl_id(mode);
+		return (0);
+	}
+
+	/* otherwise look the name up among the supported implementations */
+	for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+		const blake3_impl_ops_t *ops = blake3_impls[i];
+
+		if (!ops->is_supported())
+			continue;
+		if (strcmp(name, ops->name) != 0) {
+			cid++;
+			continue;
+		}
+
+		if (icp_blake3_impl == IMPL_PARAM) {
+			/* only remember it; blake3_setup_impl() applies it */
+			blake3_param_id = cid;
+		} else {
+			blake3_selected_impl = ops;
+			blake3_current_id = cid;
+		}
+		return (0);
+	}
+
+	return (-EINVAL);
+}
+
+/* setup implementation */
+void
+blake3_setup_impl(void)
+{
+	const uint32_t mode = IMPL_READ(icp_blake3_impl);
+
+	if (mode == IMPL_PARAM) {
+		/* apply the id stored by the parameter handler, then mark it */
+		blake3_set_impl_id(blake3_param_id);
+		atomic_swap_32(&icp_blake3_impl, IMPL_USER);
+	} else if (mode == IMPL_FASTEST) {
+		blake3_set_impl_id(IMPL_FASTEST);
+	} else if (mode == IMPL_CYCLE) {
+		blake3_set_impl_id(IMPL_CYCLE);
+	} else {
+		/* any other mode: re-select the current id */
+		blake3_set_impl_id(blake3_current_id);
+	}
+}
+
+/* return selected implementation */
+const blake3_impl_ops_t *
+blake3_impl_get_ops(void)
+{
+	if (icp_blake3_impl != IMPL_CYCLE)
+		return (blake3_selected_impl);
+
+	/* in cycle mode every call advances to the next implementation */
+	blake3_set_impl_id(IMPL_CYCLE);
+	return (blake3_selected_impl);
+}
+
+#if defined(_KERNEL) && defined(__linux__)
+/*
+ * Module parameter "set" handler.
+ *
+ * name: requested implementation name, optionally followed by whitespace
+ *       (e.g. the newline added by `echo`).
+ * kp:   unused.
+ *
+ * Returns 0 on success or -EINVAL when the name is empty, too long or
+ * matches neither a pseudo name nor a supported implementation. A rejected
+ * request leaves the previous selector mode in place.
+ */
+static int
+icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp)
+{
+	char req_name[BLAKE3_IMPL_NAME_MAX];
+	uint32_t prev_mode;
+	size_t i;
+	int err;
+
+	/* sanitize input */
+	i = strnlen(name, BLAKE3_IMPL_NAME_MAX);
+	if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX)
+		return (-EINVAL);
+
+	/* i < BLAKE3_IMPL_NAME_MAX, so the copy cannot truncate */
+	strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX);
+	while (i > 0 && isspace(req_name[i-1]))
+		i--;
+	req_name[i] = '\0';
+
+	/*
+	 * IMPL_PARAM tells blake3_set_impl_name() to only record the id.
+	 * Keep the old mode so that an unknown name does not wipe out
+	 * e.g. an active "cycle" or "fastest" selection.
+	 */
+	prev_mode = atomic_swap_32(&icp_blake3_impl, IMPL_PARAM);
+	err = blake3_set_impl_name(req_name);
+	if (err != 0)
+		atomic_swap_32(&icp_blake3_impl, prev_mode);
+
+	return (err);
+}
+
+/*
+ * Module parameter "get" handler: writes the space separated list
+ * "cycle fastest <supported names...>" into buffer, with the active choice
+ * wrapped in brackets, and returns the number of characters written.
+ */
+static int
+icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp)
+{
+	int i, cid, cnt = 0;
+	char *fmt;
+
+	/* cycling */
+	if (icp_blake3_impl == IMPL_CYCLE)
+		fmt = "[cycle] ";
+	else
+		fmt = "cycle ";
+	cnt += sprintf(buffer + cnt, fmt);
+
+	/* fastest one */
+	if (icp_blake3_impl == IMPL_FASTEST)
+		fmt = "[fastest] ";
+	else
+		fmt = "fastest ";
+	cnt += sprintf(buffer + cnt, fmt);
+
+	/* user selected */
+	for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) {
+		const blake3_impl_ops_t *ops = blake3_impls[i];
+
+		if (!ops->is_supported())
+			continue;
+		if (icp_blake3_impl == IMPL_USER && cid == blake3_current_id)
+			fmt = "[%s] ";
+		else
+			fmt = "%s ";
+		cnt += sprintf(buffer + cnt, fmt, ops->name);
+		cid++;
+	}
+
+	buffer[cnt] = 0;
+
+	return (cnt);
+}
+
+module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get,
+ NULL, 0644);
+MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation.");
+#endif
diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h
new file mode 100644
index 000000000..7b40cc4d3
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_impl.h
@@ -0,0 +1,213 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#ifndef BLAKE3_IMPL_H
+#define BLAKE3_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/blake3.h>
+#include <sys/simd.h>
+
+/*
+ * Methods used to define BLAKE3 assembler implementations
+ */
+typedef void (*blake3_compress_in_place_f)(uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN],
+ uint8_t block_len, uint64_t counter,
+ uint8_t flags);
+
+typedef void (*blake3_compress_xof_f)(const uint32_t cv[8],
+ const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+ uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs,
+ size_t num_inputs, size_t blocks, const uint32_t key[8],
+ uint64_t counter, boolean_t increment_counter, uint8_t flags,
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+typedef boolean_t (*blake3_is_supported_f)(void);
+
+/*
+ * One BLAKE3 implementation: its three hashing entry points, a runtime
+ * availability probe and the name used to select it.
+ */
+typedef struct blake3_impl_ops {
+ /* compress one block, overwriting the chaining value cv[] */
+ blake3_compress_in_place_f compress_in_place;
+ /* compress one block into a 64 byte output buffer, cv[] is read-only */
+ blake3_compress_xof_f compress_xof;
+ /* hash several inputs of `blocks` blocks each into consecutive outputs */
+ blake3_hash_many_f hash_many;
+ /* returns true when this implementation may be used on the running system */
+ blake3_is_supported_f is_supported;
+ /* NOTE(review): presumably the parallel width of hash_many - confirm */
+ int degree;
+ /* name matched by blake3_set_impl_name(), e.g. "generic" or "sse2" */
+ const char *name;
+} blake3_impl_ops_t;
+
+/* Return selected BLAKE3 implementation ops */
+extern const blake3_impl_ops_t *blake3_impl_get_ops(void);
+
+extern const blake3_impl_ops_t blake3_generic_impl;
+
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE2)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+extern const blake3_impl_ops_t blake3_sse2_impl;
+#endif
+
+#if defined(__aarch64__) || \
+ (defined(__x86_64) && defined(HAVE_SSE4_1)) || \
+ (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+extern const blake3_impl_ops_t blake3_sse41_impl;
+#endif
+
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+extern const blake3_impl_ops_t blake3_avx2_impl;
+#endif
+
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+extern const blake3_impl_ops_t blake3_avx512_impl;
+#endif
+
+#if defined(__x86_64)
+#define MAX_SIMD_DEGREE 16
+#else
+#define MAX_SIMD_DEGREE 4
+#endif
+
+#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
+
+/*
+ * Initialization vector. The generic compression function loads words
+ * 0..3 into state words 8..11.
+ */
+static const uint32_t BLAKE3_IV[8] = {
+ 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
+ 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL};
+
+/*
+ * Message word order per round: row r lists, for round r, the indexes of
+ * the sixteen message words in the order they are fed to the mixing
+ * function. There is one row for each of the 7 rounds.
+ */
+static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = {
+ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
+ {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
+ {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
+ {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
+ {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
+ {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
+ {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
+};
+
+/* Find index of the highest set bit */
+static inline unsigned int highest_one(uint64_t x) {
+#if defined(__GNUC__) || defined(__clang__)
+	/* the leading-zero count is in 0..63, so 63 - n equals 63 ^ n */
+	return (63 - __builtin_clzll(x));
+#elif defined(_MSC_VER) && defined(IS_X86_64)
+	unsigned long pos;
+	_BitScanReverse64(&pos, x);
+	return (pos);
+#elif defined(_MSC_VER) && defined(IS_X86_32)
+	unsigned long pos;
+	/* scan the upper half first, fall back to the lower half */
+	if (x >> 32) {
+		_BitScanReverse(&pos, x >> 32);
+		return (32 + pos);
+	}
+	_BitScanReverse(&pos, x);
+	return (pos);
+#else
+	/* portable binary search: halve the candidate window each step */
+	unsigned int pos = 0;
+	unsigned int width;
+
+	for (width = 32; width != 0; width >>= 1) {
+		if (x >> width) {
+			x >>= width;
+			pos += width;
+		}
+	}
+	return (pos);
+#endif
+}
+
+/* Count the number of 1 bits. */
+static inline unsigned int popcnt(uint64_t x) {
+	unsigned int bits;
+
+	/* each pass clears the lowest set bit, so the loop runs once per 1 bit */
+	for (bits = 0; x != 0; x &= x - 1)
+		bits++;
+
+	return (bits);
+}
+
+/*
+ * Largest power of two less than or equal to x.
+ * As a special case, returns 1 when x is 0.
+ */
+static inline uint64_t round_down_to_power_of_2(uint64_t x) {
+	/* OR-ing in bit 0 keeps the argument non-zero; x == 0 then yields 1 */
+	const unsigned int top = highest_one(x | 1);
+
+	return (1ULL << top);
+}
+
+/* Lower 32 bits of the 64-bit counter. */
+static inline uint32_t counter_low(uint64_t counter) {
+	const uint64_t lower = counter & 0xffffffffULL;
+
+	return ((uint32_t)lower);
+}
+
+/* Upper 32 bits of the 64-bit counter. */
+static inline uint32_t counter_high(uint64_t counter) {
+	const uint64_t upper = counter >> 32;
+
+	return ((uint32_t)upper);
+}
+
+/* Read a 32-bit little-endian word from a possibly unaligned address. */
+static inline uint32_t load32(const void *src) {
+	const uint8_t *bytes = (const uint8_t *)src;
+	uint32_t word = 0;
+	int i;
+
+	/* start at the most significant byte and shift it into place */
+	for (i = 3; i >= 0; i--)
+		word = (word << 8) | (uint32_t)bytes[i];
+
+	return (word);
+}
+
+/* Decode the key bytes into eight words using load32(). */
+static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
+    uint32_t key_words[8]) {
+	size_t w;
+
+	for (w = 0; w < 8; w++)
+		key_words[w] = load32(&key[w * 4]);
+}
+
+/* Write a 32-bit word in little-endian byte order to dst. */
+static inline void store32(void *dst, uint32_t w) {
+	uint8_t *bytes = (uint8_t *)dst;
+	unsigned int i;
+
+	/* least significant byte first */
+	for (i = 0; i < 4; i++)
+		bytes[i] = (uint8_t)(w >> (8 * i));
+}
+
+/* Serialize the eight chaining value words into 32 bytes using store32(). */
+static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
+	size_t w;
+
+	for (w = 0; w < 8; w++)
+		store32(&bytes_out[w * 4], cv_words[w]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLAKE3_IMPL_H */
diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c
new file mode 100644
index 000000000..48715e212
--- /dev/null
+++ b/module/icp/algs/blake3/blake3_x86-64.c
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2021-2022 Tino Reichardt <[email protected]>
+ */
+
+#include "blake3_impl.h"
+
+#if defined(__aarch64__) || \
+	(defined(__x86_64) && defined(HAVE_SSE2)) || \
+	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+/* Entry points provided outside of this file. */
+extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags);
+
+extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+/*
+ * The wrappers below bracket each external call with
+ * kfpu_begin()/kfpu_end().
+ */
+static void
+blake3_compress_in_place_sse2(uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags)
+{
+	kfpu_begin();
+	zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter,
+	    flags);
+	kfpu_end();
+}
+
+static void
+blake3_compress_xof_sse2(const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags, uint8_t out[64])
+{
+	kfpu_begin();
+	zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags,
+	    out);
+	kfpu_end();
+}
+
+static void
+blake3_hash_many_sse2(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
+{
+	kfpu_begin();
+	zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
+	    increment_counter, flags, flags_start, flags_end, out);
+	kfpu_end();
+}
+
+/* Usable when kfpu_allowed() holds and the per-architecture probe succeeds. */
+static boolean_t
+blake3_is_sse2_supported(void)
+{
+#if defined(__x86_64)
+	return (kfpu_allowed() && zfs_sse2_available());
+#elif defined(__PPC64__)
+	return (kfpu_allowed() && zfs_vsx_available());
+#else
+	return (kfpu_allowed());
+#endif
+}
+
+const blake3_impl_ops_t blake3_sse2_impl = {
+	.compress_in_place = blake3_compress_in_place_sse2,
+	.compress_xof = blake3_compress_xof_sse2,
+	.hash_many = blake3_hash_many_sse2,
+	.is_supported = blake3_is_sse2_supported,
+	.degree = 4,
+	.name = "sse2"
+};
+#endif
+
+/*
+ * SSE4.1 flavour. The guard must match the one used for the extern
+ * declaration of blake3_sse41_impl in blake3_impl.h and for its entry in
+ * the blake3_impls[] table (HAVE_SSE4_1 on x86-64). Otherwise a toolchain
+ * with SSE2 but without SSE4.1 support would still emit this ops table
+ * and its references to the SSE4.1 entry points.
+ */
+#if defined(__aarch64__) || \
+	(defined(__x86_64) && defined(HAVE_SSE4_1)) || \
+	(defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+
+/* Entry points provided outside of this file. */
+extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags);
+
+extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+/*
+ * The wrappers below bracket each external call with
+ * kfpu_begin()/kfpu_end(). The two compress wrappers are also reused by
+ * the AVX2 ops table further down in this file.
+ */
+static void blake3_compress_in_place_sse41(uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags) {
+	kfpu_begin();
+	zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter,
+	    flags);
+	kfpu_end();
+}
+
+static void blake3_compress_xof_sse41(const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags, uint8_t out[64]) {
+	kfpu_begin();
+	zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags,
+	    out);
+	kfpu_end();
+}
+
+static void blake3_hash_many_sse41(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+	kfpu_begin();
+	zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+	    increment_counter, flags, flags_start, flags_end, out);
+	kfpu_end();
+}
+
+/* Usable when kfpu_allowed() holds and the per-architecture probe succeeds. */
+static boolean_t blake3_is_sse41_supported(void)
+{
+#if defined(__x86_64)
+	return (kfpu_allowed() && zfs_sse4_1_available());
+#elif defined(__PPC64__)
+	return (kfpu_allowed() && zfs_vsx_available());
+#else
+	return (kfpu_allowed());
+#endif
+}
+
+const blake3_impl_ops_t blake3_sse41_impl = {
+	.compress_in_place = blake3_compress_in_place_sse41,
+	.compress_xof = blake3_compress_xof_sse41,
+	.hash_many = blake3_hash_many_sse41,
+	.is_supported = blake3_is_sse41_supported,
+	.degree = 4,
+	.name = "sse41"
+};
+#endif
+
+#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2)
+/* Entry point provided outside of this file. */
+extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+/* Brackets the external call with kfpu_begin()/kfpu_end(). */
+static void
+blake3_hash_many_avx2(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
+{
+	kfpu_begin();
+	zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
+	    increment_counter, flags, flags_start, flags_end, out);
+	kfpu_end();
+}
+
+/* SSE4.1 is probed as well because the compress ops are the SSE4.1 ones. */
+static boolean_t
+blake3_is_avx2_supported(void)
+{
+	return (kfpu_allowed() && zfs_sse4_1_available() &&
+	    zfs_avx2_available());
+}
+
+/* Only hash_many is AVX2 specific; single block compression reuses SSE4.1. */
+const blake3_impl_ops_t blake3_avx2_impl = {
+	.compress_in_place = blake3_compress_in_place_sse41,
+	.compress_xof = blake3_compress_xof_sse41,
+	.hash_many = blake3_hash_many_avx2,
+	.is_supported = blake3_is_avx2_supported,
+	.degree = 8,
+	.name = "avx2"
+};
+#endif
+
+#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL)
+/* Entry points provided outside of this file. */
+extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags);
+
+extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags, uint8_t out[64]);
+
+extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out);
+
+/*
+ * The wrappers below bracket each external call with
+ * kfpu_begin()/kfpu_end().
+ */
+static void
+blake3_compress_in_place_avx512(uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags)
+{
+	kfpu_begin();
+	zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter,
+	    flags);
+	kfpu_end();
+}
+
+static void
+blake3_compress_xof_avx512(const uint32_t cv[8],
+    const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
+    uint64_t counter, uint8_t flags, uint8_t out[64])
+{
+	kfpu_begin();
+	zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags,
+	    out);
+	kfpu_end();
+}
+
+static void
+blake3_hash_many_avx512(const uint8_t * const *inputs,
+    size_t num_inputs, size_t blocks, const uint32_t key[8],
+    uint64_t counter, boolean_t increment_counter, uint8_t flags,
+    uint8_t flags_start, uint8_t flags_end, uint8_t *out)
+{
+	kfpu_begin();
+	zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
+	    increment_counter, flags, flags_start, flags_end, out);
+	kfpu_end();
+}
+
+/* Both the AVX512F and the AVX512VL probe have to succeed. */
+static boolean_t
+blake3_is_avx512_supported(void)
+{
+	return (kfpu_allowed() && zfs_avx512f_available() &&
+	    zfs_avx512vl_available());
+}
+
+const blake3_impl_ops_t blake3_avx512_impl = {
+	.compress_in_place = blake3_compress_in_place_avx512,
+	.compress_xof = blake3_compress_xof_avx512,
+	.hash_many = blake3_hash_many_avx512,
+	.is_supported = blake3_is_avx512_supported,
+	.degree = 16,
+	.name = "avx512"
+};
+#endif
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
new file mode 100644
index 000000000..59a4d9afd
--- /dev/null
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -0,0 +1,2450 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE2 -> ARMv8-A
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if defined(__aarch64__)
+ .text
+/*
+ * zfs_blake3_compress_in_place_sse2
+ *
+ * Single-block BLAKE3 compression that overwrites the chaining value.
+ * The C prototype is not visible in this file; from the register usage
+ * below the arguments are taken (AAPCS64) as:
+ *   x0 = chaining value: 32 bytes are read on entry and written on exit
+ *   x1 = message block: all 64 bytes are read
+ *   w2 = block length, only the low 8 bits are used
+ *   x3 = 64-bit counter
+ *   w4 = flags, only the low 8 bits are used
+ * NOTE(review): presumably this matches the prototype of the other
+ * zfs_blake3_compress_in_place_* routines - confirm against the C glue.
+ *
+ * Straight-line leaf code: no branches, no stack accesses.  Only x8-x12,
+ * v0-v7 and v16-v26 are written, so no callee-saved register is touched.
+ *
+ * Recurring idioms:
+ *   rev32 Vd.8h, Vn.8h            rotate every 32-bit lane by 16
+ *   ushr #12 / shl #20 / orr      rotate every 32-bit lane right by 12
+ *   ushr #8  / shl #24 / orr      rotate every 32-bit lane right by 8
+ *   ushr #7  / shl #25 / orr      rotate every 32-bit lane right by 7
+ *   ext Vd, Vn, Vn, #4/#8/#12     rotate the four 32-bit lanes of a vector
+ * The rot-16/12/8/7 sequence occurs 14 times, which is consistent with the
+ * seven BLAKE3 rounds of one column step and one diagonal step each.  The
+ * uzp/zip/ext/bsl/bit/bif instructions scheduled in between presumably
+ * build the permuted message words for the following round.
+ */
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+/* 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A: first four BLAKE3 IV words */
+.LCPI0_0:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+/* bit-select mask with only 32-bit lane 3 set */
+.LCPI0_1:
+ .xword 0
+ .xword -4294967296
+/* bit-select mask with 32-bit lanes 0-2 set */
+.LCPI0_2:
+ .xword -1
+ .xword 4294967295
+ .text
+ .globl zfs_blake3_compress_in_place_sse2
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse2,@function
+zfs_blake3_compress_in_place_sse2:
+ .cfi_startproc
+ ldp q3, q2, [x0] /* v3 = cv words 0-3, v2 = cv words 4-7 */
+ ldp q5, q6, [x1] /* block bytes 0-31 */
+ add x10, x1, #32
+ lsr x11, x3, #32
+ fmov s4, w3 /* v4 lane 0 = low 32 bits of counter */
+ ld2 { v17.4s, v18.4s }, [x10] /* block bytes 32-63: v17 = even words, v18 = odd words */
+ adrp x10, .LCPI0_2
+ and w8, w2, #0xff
+ mov v4.s[1], w11 /* v4 lane 1 = high 32 bits of counter */
+ ldr q1, [x10, :lo12:.LCPI0_2] /* v1 = lanes 0-2 mask */
+ and w9, w4, #0xff
+ adrp x12, .LCPI0_0
+ mov v4.s[2], w8 /* v4 lane 2 = block length */
+ uzp1 v19.4s, v5.4s, v6.4s
+ add v3.4s, v2.4s, v3.4s
+ ldr q7, [x12, :lo12:.LCPI0_0] /* v7 = IV words */
+ mov v4.s[3], w9 /* v4 lane 3 = flags */
+ add v3.4s, v3.4s, v19.4s
+ uzp2 v5.4s, v5.4s, v6.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ uzp1 v6.4s, v19.4s, v19.4s
+ ext v22.16b, v19.16b, v19.16b, #12
+ eor v4.16b, v3.16b, v4.16b
+ ext v20.16b, v17.16b, v17.16b, #12
+ ext v6.16b, v6.16b, v19.16b, #8
+ ext v19.16b, v19.16b, v22.16b, #12
+ zip1 v22.2d, v21.2d, v5.2d
+ rev32 v24.8h, v4.8h
+ mov v4.16b, v1.16b
+ zip2 v23.4s, v5.4s, v21.4s
+ uzp2 v6.4s, v6.4s, v5.4s
+ bsl v4.16b, v22.16b, v20.16b
+ add v3.4s, v3.4s, v5.4s
+ zip1 v5.4s, v23.4s, v20.4s
+ zip1 v22.4s, v20.4s, v23.4s
+ add v23.4s, v24.4s, v7.4s
+ ext v7.16b, v6.16b, v6.16b, #4
+ ext v25.16b, v4.16b, v4.16b, #12
+ ext v5.16b, v22.16b, v5.16b, #8
+ eor v2.16b, v23.16b, v2.16b
+ uzp1 v4.4s, v4.4s, v25.4s
+ uzp1 v22.4s, v7.4s, v7.4s
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v22.16b, v22.16b, v7.16b, #8
+ ext v7.16b, v7.16b, v25.16b, #12
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ add v3.4s, v3.4s, v2.4s
+ eor v24.16b, v3.16b, v24.16b
+ add v3.4s, v3.4s, v17.4s
+ ushr v17.4s, v24.4s, #8
+ shl v18.4s, v24.4s, #24
+ orr v17.16b, v18.16b, v17.16b
+ add v18.4s, v17.4s, v23.4s
+ eor v2.16b, v18.16b, v2.16b
+ ushr v23.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v3.16b, v3.16b, v3.16b, #12
+ orr v2.16b, v2.16b, v23.16b
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v3.4s, v2.4s, v3.4s
+ adrp x11, .LCPI0_1
+ eor v17.16b, v3.16b, v17.16b
+ ldr q16, [x11, :lo12:.LCPI0_1] /* v16 = lane 3 mask */
+ ext v18.16b, v18.16b, v18.16b, #4
+ rev32 v24.8h, v17.8h
+ movi v0.2d, #0xffffffff00000000 /* v0 = mask with lanes 1 and 3 set */
+ add v23.4s, v3.4s, v21.4s
+ mov v21.s[1], v20.s[2]
+ add v20.4s, v18.4s, v24.4s
+ bit v19.16b, v21.16b, v0.16b
+ eor v3.16b, v20.16b, v2.16b
+ uzp2 v2.4s, v22.4s, v19.4s
+ zip1 v17.2d, v5.2d, v19.2d
+ zip2 v18.4s, v19.4s, v5.4s
+ ushr v21.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ ext v22.16b, v2.16b, v2.16b, #4
+ bsl v16.16b, v4.16b, v17.16b
+ zip1 v17.4s, v18.4s, v4.4s
+ zip1 v18.4s, v4.4s, v18.4s
+ orr v21.16b, v3.16b, v21.16b
+ ext v25.16b, v16.16b, v16.16b, #12
+ ext v3.16b, v18.16b, v17.16b, #8
+ uzp1 v18.4s, v22.4s, v22.4s
+ ext v26.16b, v22.16b, v22.16b, #12
+ add v23.4s, v23.4s, v21.4s
+ uzp1 v17.4s, v16.4s, v25.4s
+ ext v16.16b, v18.16b, v22.16b, #8
+ ext v18.16b, v22.16b, v26.16b, #12
+ eor v22.16b, v23.16b, v24.16b
+ add v6.4s, v23.4s, v6.4s
+ ushr v23.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v23.16b
+ add v20.4s, v22.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v23.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v21.16b, v21.16b, v23.16b
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v21.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v20.16b, v20.16b, v20.16b, #12
+ add v6.4s, v6.4s, v19.4s
+ rev32 v19.8h, v22.8h
+ add v20.4s, v20.4s, v19.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v22.4s, v21.4s, #12
+ shl v21.4s, v21.4s, #20
+ orr v21.16b, v21.16b, v22.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ushr v22.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v22.16b
+ add v20.4s, v19.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v22.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ add v6.4s, v6.4s, v4.4s
+ orr v21.16b, v21.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ext v20.16b, v20.16b, v20.16b, #4
+ rev32 v19.8h, v19.8h
+ add v20.4s, v20.4s, v19.4s
+ add v6.4s, v6.4s, v5.4s
+ mov v5.s[1], v4.s[2]
+ eor v4.16b, v20.16b, v21.16b
+ ushr v21.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v21.16b, v4.16b, v21.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ add v2.4s, v6.4s, v2.4s
+ ushr v6.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v6.16b, v19.16b, v6.16b
+ add v19.4s, v6.4s, v20.4s
+ eor v20.16b, v19.16b, v21.16b
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v20.4s, v2.4s
+ eor v6.16b, v2.16b, v6.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v6.4s
+ mov v22.16b, v0.16b
+ eor v20.16b, v19.16b, v20.16b
+ bsl v22.16b, v5.16b, v7.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ add v2.4s, v2.4s, v22.4s
+ orr v20.16b, v20.16b, v21.16b
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ ushr v21.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v6.16b, v6.16b, v21.16b
+ add v19.4s, v6.4s, v19.4s
+ eor v20.16b, v19.16b, v20.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v2.4s, v2.4s, v17.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ uzp2 v5.4s, v16.4s, v22.4s
+ zip1 v7.2d, v3.2d, v22.2d
+ zip2 v16.4s, v22.4s, v3.4s
+ ext v19.16b, v19.16b, v19.16b, #4
+ rev32 v22.8h, v6.8h
+ ext v23.16b, v5.16b, v5.16b, #4
+ bif v7.16b, v17.16b, v1.16b
+ zip1 v24.4s, v16.4s, v17.4s
+ zip1 v16.4s, v17.4s, v16.4s
+ add v21.4s, v2.4s, v3.4s
+ mov v3.s[1], v17.s[2]
+ add v17.4s, v19.4s, v22.4s
+ mov v19.16b, v0.16b
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v4.16b, v16.16b, v24.16b, #8
+ uzp1 v16.4s, v23.4s, v23.4s
+ bsl v19.16b, v3.16b, v18.16b
+ eor v2.16b, v17.16b, v20.16b
+ uzp1 v7.4s, v7.4s, v25.4s
+ ext v25.16b, v16.16b, v23.16b, #8
+ zip1 v3.2d, v4.2d, v19.2d
+ ushr v20.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp2 v6.4s, v25.4s, v19.4s
+ zip2 v18.4s, v19.4s, v4.4s
+ bif v3.16b, v7.16b, v1.16b
+ orr v20.16b, v2.16b, v20.16b
+ ext v16.16b, v23.16b, v24.16b, #12
+ ext v23.16b, v6.16b, v6.16b, #4
+ zip1 v24.4s, v18.4s, v7.4s
+ zip1 v18.4s, v7.4s, v18.4s
+ ext v25.16b, v3.16b, v3.16b, #12
+ add v21.4s, v21.4s, v20.4s
+ ext v2.16b, v18.16b, v24.16b, #8
+ uzp1 v18.4s, v23.4s, v23.4s
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp1 v3.4s, v3.4s, v25.4s
+ eor v22.16b, v21.16b, v22.16b
+ ext v25.16b, v18.16b, v23.16b, #8
+ dup v18.4s, v2.s[3]
+ ext v23.16b, v23.16b, v24.16b, #12
+ add v5.4s, v21.4s, v5.4s
+ trn1 v21.4s, v3.4s, v3.4s
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ ext v18.16b, v21.16b, v18.16b, #8
+ orr v21.16b, v22.16b, v24.16b
+ add v17.4s, v21.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v22.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v20.16b, v20.16b, v22.16b
+ ext v21.16b, v21.16b, v21.16b, #8
+ add v5.4s, v20.4s, v5.4s
+ eor v21.16b, v5.16b, v21.16b
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v5.4s, v5.4s, v19.4s
+ rev32 v19.8h, v21.8h
+ add v17.4s, v17.4s, v19.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v21.16b
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ushr v21.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v21.16b
+ add v17.4s, v19.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v5.4s, v5.4s, v7.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ rev32 v22.8h, v19.8h
+ add v21.4s, v5.4s, v4.4s
+ mov v4.s[1], v7.s[2]
+ add v19.4s, v17.4s, v22.4s
+ bit v16.16b, v4.16b, v0.16b
+ eor v5.16b, v19.16b, v20.16b
+ uzp2 v4.4s, v25.4s, v16.4s
+ zip1 v7.2d, v2.2d, v16.2d
+ zip2 v17.4s, v16.4s, v2.4s
+ ushr v20.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ext v24.16b, v4.16b, v4.16b, #4
+ bif v7.16b, v3.16b, v1.16b
+ zip1 v25.4s, v17.4s, v3.4s
+ zip1 v17.4s, v3.4s, v17.4s
+ orr v20.16b, v5.16b, v20.16b
+ ext v26.16b, v7.16b, v7.16b, #12
+ ext v5.16b, v17.16b, v25.16b, #8
+ uzp1 v17.4s, v24.4s, v24.4s
+ ext v25.16b, v24.16b, v24.16b, #12
+ bit v23.16b, v18.16b, v0.16b
+ add v21.4s, v21.4s, v20.4s
+ uzp1 v7.4s, v7.4s, v26.4s
+ ext v26.16b, v17.16b, v24.16b, #8
+ ext v17.16b, v24.16b, v25.16b, #12
+ eor v22.16b, v21.16b, v22.16b
+ add v6.4s, v21.4s, v6.4s
+ zip1 v21.2d, v5.2d, v23.2d
+ zip2 v24.4s, v23.4s, v5.4s
+ bif v21.16b, v7.16b, v1.16b
+ zip1 v1.4s, v24.4s, v7.4s
+ zip1 v24.4s, v7.4s, v24.4s
+ ext v1.16b, v24.16b, v1.16b, #8
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v24.16b
+ add v19.4s, v22.4s, v19.4s
+ ext v24.16b, v21.16b, v21.16b, #12
+ eor v20.16b, v19.16b, v20.16b
+ uzp1 v21.4s, v21.4s, v24.4s
+ ushr v24.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ orr v20.16b, v20.16b, v24.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v20.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v6.4s, v6.4s, v16.4s
+ rev32 v16.8h, v22.8h
+ add v19.4s, v19.4s, v16.4s
+ eor v20.16b, v19.16b, v20.16b
+ ushr v22.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v22.16b
+ add v6.4s, v6.4s, v20.4s
+ eor v16.16b, v6.16b, v16.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v6.4s, v3.4s
+ ushr v6.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v6.16b, v16.16b, v6.16b
+ add v16.4s, v6.4s, v19.4s
+ eor v19.16b, v16.16b, v20.16b
+ ushr v20.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v20.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v3.4s, v3.4s, v19.4s
+ eor v6.16b, v3.16b, v6.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ add v2.4s, v3.4s, v2.4s
+ rev32 v3.8h, v6.8h
+ add v6.4s, v16.4s, v3.4s
+ eor v16.16b, v6.16b, v19.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ orr v16.16b, v16.16b, v19.16b
+ add v2.4s, v2.4s, v16.4s
+ eor v3.16b, v2.16b, v3.16b
+ add v2.4s, v2.4s, v4.4s
+ ushr v4.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v4.16b
+ add v4.4s, v3.4s, v6.4s
+ eor v6.16b, v4.16b, v16.16b
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v6.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v6.16b, v4.16b, v6.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ add v2.4s, v2.4s, v23.4s
+ orr v6.16b, v6.16b, v16.16b
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v16.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v16.16b
+ add v4.4s, v3.4s, v4.4s
+ eor v6.16b, v4.16b, v6.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ add v2.4s, v2.4s, v7.4s
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ rev32 v3.8h, v3.8h
+ add v2.4s, v2.4s, v5.4s
+ mov v5.s[1], v7.s[2]
+ add v4.4s, v4.4s, v3.4s
+ bsl v0.16b, v5.16b, v17.16b
+ eor v5.16b, v4.16b, v6.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v5.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v6.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v6.16b
+ add v4.4s, v3.4s, v4.4s
+ uzp2 v18.4s, v26.4s, v18.4s
+ eor v5.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v18.4s
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v5.16b, v5.16b, v6.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v5.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v0.4s, v2.4s, v0.4s
+ rev32 v2.8h, v3.8h
+ add v3.4s, v4.4s, v2.4s
+ eor v4.16b, v3.16b, v5.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v4.16b, v4.16b, v5.16b
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ushr v5.4s, v2.4s, #8
+ shl v2.4s, v2.4s, #24
+ orr v2.16b, v2.16b, v5.16b
+ add v3.4s, v2.4s, v3.4s
+ eor v4.16b, v3.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #12
+ ushr v5.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v0.4s, v0.4s, v21.4s
+ orr v4.16b, v4.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ add v0.4s, v0.4s, v1.4s
+ rev32 v1.8h, v2.8h
+ add v2.4s, v3.4s, v1.4s
+ eor v3.16b, v2.16b, v4.16b
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v4.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v1.16b, v0.16b, v1.16b
+ ushr v4.4s, v1.4s, #8
+ shl v1.4s, v1.4s, #24
+ orr v1.16b, v1.16b, v4.16b
+ add v2.4s, v1.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v1.16b, v1.16b, v1.16b, #8
+ eor v0.16b, v2.16b, v0.16b /* fold two of the four state vectors together */
+ orr v2.16b, v3.16b, v4.16b
+ eor v1.16b, v2.16b, v1.16b /* fold the remaining two state vectors together */
+ stp q0, q1, [x0] /* overwrite the 32-byte chaining value at x0 */
+ ret
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2
+ .cfi_endproc
+
+/*
+ * zfs_blake3_compress_xof_sse2
+ *
+ * Single-block BLAKE3 compression that writes a 64-byte result to a
+ * separate buffer and leaves the chaining value untouched.  The C
+ * prototype is not visible in this file; from the register usage below
+ * the arguments are taken (AAPCS64) as:
+ *   x0 = chaining value: 32 bytes, only read
+ *   x1 = message block: all 64 bytes are read
+ *   w2 = block length, only the low 8 bits are used
+ *   x3 = 64-bit counter
+ *   w4 = flags, only the low 8 bits are used
+ *   x5 = output buffer: 64 bytes are written
+ * NOTE(review): presumably this matches the prototype of the other
+ * zfs_blake3_compress_xof_* routines - confirm against the C glue.
+ *
+ * Straight-line leaf code: no branches, no stack accesses.  Only x8-x12,
+ * v0-v7 and v16-v26 are written, so no callee-saved register is touched.
+ *
+ * Recurring idioms:
+ *   rev32 Vd.8h, Vn.8h            rotate every 32-bit lane by 16
+ *   ushr #12 / shl #20 / orr      rotate every 32-bit lane right by 12
+ *   ushr #8  / shl #24 / orr      rotate every 32-bit lane right by 8
+ *   ushr #7  / shl #25 / orr      rotate every 32-bit lane right by 7
+ *   ext Vd, Vn, Vn, #4/#8/#12     rotate the four 32-bit lanes of a vector
+ * The uzp/zip/ext/bsl/bit/bif instructions scheduled in between presumably
+ * build the permuted message words for the following round.
+ */
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+/* 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A: first four BLAKE3 IV words */
+.LCPI1_0:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+/* bit-select mask with only 32-bit lane 3 set */
+.LCPI1_1:
+ .xword 0
+ .xword -4294967296
+/* bit-select mask with 32-bit lanes 0-2 set */
+.LCPI1_2:
+ .xword -1
+ .xword 4294967295
+ .text
+ .globl zfs_blake3_compress_xof_sse2
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2:
+ .cfi_startproc
+ ldp q3, q2, [x0] /* v3 = cv words 0-3, v2 = cv words 4-7 */
+ ldp q5, q6, [x1] /* block bytes 0-31 */
+ add x10, x1, #32
+ lsr x11, x3, #32
+ fmov s4, w3 /* v4 lane 0 = low 32 bits of counter */
+ ld2 { v17.4s, v18.4s }, [x10] /* block bytes 32-63: v17 = even words, v18 = odd words */
+ adrp x10, .LCPI1_2
+ and w8, w2, #0xff
+ mov v4.s[1], w11 /* v4 lane 1 = high 32 bits of counter */
+ ldr q1, [x10, :lo12:.LCPI1_2] /* v1 = lanes 0-2 mask */
+ and w9, w4, #0xff
+ adrp x12, .LCPI1_0
+ mov v4.s[2], w8 /* v4 lane 2 = block length */
+ uzp1 v19.4s, v5.4s, v6.4s
+ add v3.4s, v2.4s, v3.4s
+ ldr q7, [x12, :lo12:.LCPI1_0] /* v7 = IV words */
+ mov v4.s[3], w9 /* v4 lane 3 = flags */
+ add v3.4s, v3.4s, v19.4s
+ uzp2 v5.4s, v5.4s, v6.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ uzp1 v6.4s, v19.4s, v19.4s
+ ext v22.16b, v19.16b, v19.16b, #12
+ eor v4.16b, v3.16b, v4.16b
+ ext v20.16b, v17.16b, v17.16b, #12
+ ext v6.16b, v6.16b, v19.16b, #8
+ ext v19.16b, v19.16b, v22.16b, #12
+ zip1 v22.2d, v21.2d, v5.2d
+ rev32 v24.8h, v4.8h
+ mov v4.16b, v1.16b
+ zip2 v23.4s, v5.4s, v21.4s
+ uzp2 v6.4s, v6.4s, v5.4s
+ bsl v4.16b, v22.16b, v20.16b
+ add v3.4s, v3.4s, v5.4s
+ zip1 v5.4s, v23.4s, v20.4s
+ zip1 v22.4s, v20.4s, v23.4s
+ add v23.4s, v24.4s, v7.4s
+ ext v7.16b, v6.16b, v6.16b, #4
+ ext v25.16b, v4.16b, v4.16b, #12
+ ext v5.16b, v22.16b, v5.16b, #8
+ eor v2.16b, v23.16b, v2.16b
+ uzp1 v4.4s, v4.4s, v25.4s
+ uzp1 v22.4s, v7.4s, v7.4s
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v22.16b, v22.16b, v7.16b, #8
+ ext v7.16b, v7.16b, v25.16b, #12
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ add v3.4s, v3.4s, v2.4s
+ eor v24.16b, v3.16b, v24.16b
+ add v3.4s, v3.4s, v17.4s
+ ushr v17.4s, v24.4s, #8
+ shl v18.4s, v24.4s, #24
+ orr v17.16b, v18.16b, v17.16b
+ add v18.4s, v17.4s, v23.4s
+ eor v2.16b, v18.16b, v2.16b
+ ushr v23.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v3.16b, v3.16b, v3.16b, #12
+ orr v2.16b, v2.16b, v23.16b
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v3.4s, v2.4s, v3.4s
+ adrp x11, .LCPI1_1
+ eor v17.16b, v3.16b, v17.16b
+ ldr q16, [x11, :lo12:.LCPI1_1] /* v16 = lane 3 mask */
+ ext v18.16b, v18.16b, v18.16b, #4
+ rev32 v24.8h, v17.8h
+ movi v0.2d, #0xffffffff00000000 /* v0 = mask with lanes 1 and 3 set */
+ add v23.4s, v3.4s, v21.4s
+ mov v21.s[1], v20.s[2]
+ add v20.4s, v18.4s, v24.4s
+ bit v19.16b, v21.16b, v0.16b
+ eor v3.16b, v20.16b, v2.16b
+ uzp2 v2.4s, v22.4s, v19.4s
+ zip1 v17.2d, v5.2d, v19.2d
+ zip2 v18.4s, v19.4s, v5.4s
+ ushr v21.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ ext v22.16b, v2.16b, v2.16b, #4
+ bsl v16.16b, v4.16b, v17.16b
+ zip1 v17.4s, v18.4s, v4.4s
+ zip1 v18.4s, v4.4s, v18.4s
+ orr v21.16b, v3.16b, v21.16b
+ ext v25.16b, v16.16b, v16.16b, #12
+ ext v3.16b, v18.16b, v17.16b, #8
+ uzp1 v18.4s, v22.4s, v22.4s
+ ext v26.16b, v22.16b, v22.16b, #12
+ add v23.4s, v23.4s, v21.4s
+ uzp1 v17.4s, v16.4s, v25.4s
+ ext v16.16b, v18.16b, v22.16b, #8
+ ext v18.16b, v22.16b, v26.16b, #12
+ eor v22.16b, v23.16b, v24.16b
+ add v6.4s, v23.4s, v6.4s
+ ushr v23.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v23.16b
+ add v20.4s, v22.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v23.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v21.16b, v21.16b, v23.16b
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v21.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v20.16b, v20.16b, v20.16b, #12
+ add v6.4s, v6.4s, v19.4s
+ rev32 v19.8h, v22.8h
+ add v20.4s, v20.4s, v19.4s
+ eor v21.16b, v20.16b, v21.16b
+ ushr v22.4s, v21.4s, #12
+ shl v21.4s, v21.4s, #20
+ orr v21.16b, v21.16b, v22.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ushr v22.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v22.16b
+ add v20.4s, v19.4s, v20.4s
+ eor v21.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v22.4s, v21.4s, #7
+ shl v21.4s, v21.4s, #25
+ add v6.4s, v6.4s, v4.4s
+ orr v21.16b, v21.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ ext v20.16b, v20.16b, v20.16b, #4
+ rev32 v19.8h, v19.8h
+ add v20.4s, v20.4s, v19.4s
+ add v6.4s, v6.4s, v5.4s
+ mov v5.s[1], v4.s[2]
+ eor v4.16b, v20.16b, v21.16b
+ ushr v21.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v21.16b, v4.16b, v21.16b
+ add v6.4s, v6.4s, v21.4s
+ eor v19.16b, v6.16b, v19.16b
+ add v2.4s, v6.4s, v2.4s
+ ushr v6.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v6.16b, v19.16b, v6.16b
+ add v19.4s, v6.4s, v20.4s
+ eor v20.16b, v19.16b, v21.16b
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v20.4s, v2.4s
+ eor v6.16b, v2.16b, v6.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v6.4s
+ mov v22.16b, v0.16b
+ eor v20.16b, v19.16b, v20.16b
+ bsl v22.16b, v5.16b, v7.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ add v2.4s, v2.4s, v22.4s
+ orr v20.16b, v20.16b, v21.16b
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ ushr v21.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v6.16b, v6.16b, v21.16b
+ add v19.4s, v6.4s, v19.4s
+ eor v20.16b, v19.16b, v20.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v2.4s, v2.4s, v17.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v2.4s, v2.4s, v20.4s
+ eor v6.16b, v2.16b, v6.16b
+ uzp2 v5.4s, v16.4s, v22.4s
+ zip1 v7.2d, v3.2d, v22.2d
+ zip2 v16.4s, v22.4s, v3.4s
+ ext v19.16b, v19.16b, v19.16b, #4
+ rev32 v22.8h, v6.8h
+ ext v23.16b, v5.16b, v5.16b, #4
+ bif v7.16b, v17.16b, v1.16b
+ zip1 v24.4s, v16.4s, v17.4s
+ zip1 v16.4s, v17.4s, v16.4s
+ add v21.4s, v2.4s, v3.4s
+ mov v3.s[1], v17.s[2]
+ add v17.4s, v19.4s, v22.4s
+ mov v19.16b, v0.16b
+ ext v25.16b, v7.16b, v7.16b, #12
+ ext v4.16b, v16.16b, v24.16b, #8
+ uzp1 v16.4s, v23.4s, v23.4s
+ bsl v19.16b, v3.16b, v18.16b
+ eor v2.16b, v17.16b, v20.16b
+ uzp1 v7.4s, v7.4s, v25.4s
+ ext v25.16b, v16.16b, v23.16b, #8
+ zip1 v3.2d, v4.2d, v19.2d
+ ushr v20.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp2 v6.4s, v25.4s, v19.4s
+ zip2 v18.4s, v19.4s, v4.4s
+ bif v3.16b, v7.16b, v1.16b
+ orr v20.16b, v2.16b, v20.16b
+ ext v16.16b, v23.16b, v24.16b, #12
+ ext v23.16b, v6.16b, v6.16b, #4
+ zip1 v24.4s, v18.4s, v7.4s
+ zip1 v18.4s, v7.4s, v18.4s
+ ext v25.16b, v3.16b, v3.16b, #12
+ add v21.4s, v21.4s, v20.4s
+ ext v2.16b, v18.16b, v24.16b, #8
+ uzp1 v18.4s, v23.4s, v23.4s
+ ext v24.16b, v23.16b, v23.16b, #12
+ uzp1 v3.4s, v3.4s, v25.4s
+ eor v22.16b, v21.16b, v22.16b
+ ext v25.16b, v18.16b, v23.16b, #8
+ dup v18.4s, v2.s[3]
+ ext v23.16b, v23.16b, v24.16b, #12
+ add v5.4s, v21.4s, v5.4s
+ trn1 v21.4s, v3.4s, v3.4s
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ ext v18.16b, v21.16b, v18.16b, #8
+ orr v21.16b, v22.16b, v24.16b
+ add v17.4s, v21.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v22.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v20.16b, v20.16b, v22.16b
+ ext v21.16b, v21.16b, v21.16b, #8
+ add v5.4s, v20.4s, v5.4s
+ eor v21.16b, v5.16b, v21.16b
+ ext v17.16b, v17.16b, v17.16b, #12
+ add v5.4s, v5.4s, v19.4s
+ rev32 v19.8h, v21.8h
+ add v17.4s, v17.4s, v19.4s
+ eor v20.16b, v17.16b, v20.16b
+ ushr v21.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v21.16b
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ushr v21.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v21.16b
+ add v17.4s, v19.4s, v17.4s
+ eor v20.16b, v17.16b, v20.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ushr v21.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ add v5.4s, v5.4s, v7.4s
+ orr v20.16b, v20.16b, v21.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v5.4s, v5.4s, v20.4s
+ eor v19.16b, v5.16b, v19.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ rev32 v22.8h, v19.8h
+ add v21.4s, v5.4s, v4.4s
+ mov v4.s[1], v7.s[2]
+ add v19.4s, v17.4s, v22.4s
+ bit v16.16b, v4.16b, v0.16b
+ eor v5.16b, v19.16b, v20.16b
+ uzp2 v4.4s, v25.4s, v16.4s
+ zip1 v7.2d, v2.2d, v16.2d
+ zip2 v17.4s, v16.4s, v2.4s
+ ushr v20.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ext v24.16b, v4.16b, v4.16b, #4
+ bif v7.16b, v3.16b, v1.16b
+ zip1 v25.4s, v17.4s, v3.4s
+ zip1 v17.4s, v3.4s, v17.4s
+ orr v20.16b, v5.16b, v20.16b
+ ext v26.16b, v7.16b, v7.16b, #12
+ ext v5.16b, v17.16b, v25.16b, #8
+ uzp1 v17.4s, v24.4s, v24.4s
+ ext v25.16b, v24.16b, v24.16b, #12
+ bit v23.16b, v18.16b, v0.16b
+ add v21.4s, v21.4s, v20.4s
+ uzp1 v7.4s, v7.4s, v26.4s
+ ext v26.16b, v17.16b, v24.16b, #8
+ ext v17.16b, v24.16b, v25.16b, #12
+ eor v22.16b, v21.16b, v22.16b
+ add v6.4s, v21.4s, v6.4s
+ zip1 v21.2d, v5.2d, v23.2d
+ zip2 v24.4s, v23.4s, v5.4s
+ bif v21.16b, v7.16b, v1.16b
+ zip1 v1.4s, v24.4s, v7.4s
+ zip1 v24.4s, v7.4s, v24.4s
+ ext v1.16b, v24.16b, v1.16b, #8
+ ushr v24.4s, v22.4s, #8
+ shl v22.4s, v22.4s, #24
+ orr v22.16b, v22.16b, v24.16b
+ add v19.4s, v22.4s, v19.4s
+ ext v24.16b, v21.16b, v21.16b, #12
+ eor v20.16b, v19.16b, v20.16b
+ uzp1 v21.4s, v21.4s, v24.4s
+ ushr v24.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ orr v20.16b, v20.16b, v24.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v22.16b, v22.16b, v22.16b, #8
+ add v6.4s, v20.4s, v6.4s
+ eor v22.16b, v6.16b, v22.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ add v6.4s, v6.4s, v16.4s
+ rev32 v16.8h, v22.8h
+ add v19.4s, v19.4s, v16.4s
+ eor v20.16b, v19.16b, v20.16b
+ ushr v22.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v20.16b, v20.16b, v22.16b
+ add v6.4s, v6.4s, v20.4s
+ eor v16.16b, v6.16b, v16.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v6.4s, v3.4s
+ ushr v6.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v6.16b, v16.16b, v6.16b
+ add v16.4s, v6.4s, v19.4s
+ eor v19.16b, v16.16b, v20.16b
+ ushr v20.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v20.16b
+ ext v6.16b, v6.16b, v6.16b, #8
+ add v3.4s, v3.4s, v19.4s
+ eor v6.16b, v3.16b, v6.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ add v2.4s, v3.4s, v2.4s
+ rev32 v3.8h, v6.8h
+ add v6.4s, v16.4s, v3.4s
+ eor v16.16b, v6.16b, v19.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ orr v16.16b, v16.16b, v19.16b
+ add v2.4s, v2.4s, v16.4s
+ eor v3.16b, v2.16b, v3.16b
+ add v2.4s, v2.4s, v4.4s
+ ushr v4.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v4.16b
+ add v4.4s, v3.4s, v6.4s
+ eor v6.16b, v4.16b, v16.16b
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v6.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v6.16b, v4.16b, v6.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ add v2.4s, v2.4s, v23.4s
+ orr v6.16b, v6.16b, v16.16b
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v16.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v16.16b
+ add v4.4s, v3.4s, v4.4s
+ eor v6.16b, v4.16b, v6.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ add v2.4s, v2.4s, v7.4s
+ orr v6.16b, v6.16b, v16.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v2.4s, v6.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ rev32 v3.8h, v3.8h
+ add v2.4s, v2.4s, v5.4s
+ mov v5.s[1], v7.s[2]
+ add v4.4s, v4.4s, v3.4s
+ bsl v0.16b, v5.16b, v17.16b
+ eor v5.16b, v4.16b, v6.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v5.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v6.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v6.16b
+ add v4.4s, v3.4s, v4.4s
+ uzp2 v18.4s, v26.4s, v18.4s
+ eor v5.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v18.4s
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v5.16b, v5.16b, v6.16b
+ ext v3.16b, v3.16b, v3.16b, #8
+ add v2.4s, v5.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v0.4s, v2.4s, v0.4s
+ rev32 v2.8h, v3.8h
+ add v3.4s, v4.4s, v2.4s
+ eor v4.16b, v3.16b, v5.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ orr v4.16b, v4.16b, v5.16b
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ushr v5.4s, v2.4s, #8
+ shl v2.4s, v2.4s, #24
+ orr v2.16b, v2.16b, v5.16b
+ add v3.4s, v2.4s, v3.4s
+ eor v4.16b, v3.16b, v4.16b
+ ext v0.16b, v0.16b, v0.16b, #12
+ ushr v5.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v0.4s, v0.4s, v21.4s
+ orr v4.16b, v4.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #8
+ add v0.4s, v0.4s, v4.4s
+ eor v2.16b, v0.16b, v2.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ add v0.4s, v0.4s, v1.4s
+ rev32 v1.8h, v2.8h
+ add v2.4s, v3.4s, v1.4s
+ eor v3.16b, v2.16b, v4.16b
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v4.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v1.16b, v0.16b, v1.16b
+ ushr v4.4s, v1.4s, #8
+ shl v1.4s, v1.4s, #24
+ orr v1.16b, v1.16b, v4.16b
+ add v2.4s, v1.4s, v2.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v0.16b, v0.16b, v0.16b, #4
+ ext v1.16b, v1.16b, v1.16b, #8
+ ext v2.16b, v2.16b, v2.16b, #12
+ orr v3.16b, v3.16b, v4.16b
+ eor v0.16b, v2.16b, v0.16b /* fold two of the four state vectors together */
+ eor v3.16b, v3.16b, v1.16b /* fold the remaining two state vectors together */
+ stp q0, q3, [x5] /* out bytes 0-31 */
+ ldr q0, [x0] /* reload cv words 0-3 */
+ eor v0.16b, v0.16b, v2.16b
+ str q0, [x5, #32] /* out bytes 32-47 = cv words 0-3 xor state vector v2 */
+ ldr q0, [x0, #16] /* reload cv words 4-7 */
+ eor v0.16b, v0.16b, v1.16b
+ str q0, [x5, #48] /* out bytes 48-63 = cv words 4-7 xor state vector v1 */
+ ret
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI2_0:
+ .word 0
+ .word 1
+ .word 2
+ .word 3
+ .text
+ .globl zfs_blake3_hash_many_sse2
+ .p2align 2
+ .type zfs_blake3_hash_many_sse2,@function
+zfs_blake3_hash_many_sse2:
+ .cfi_startproc
+ stp d15, d14, [sp, #-160]!
+ stp d13, d12, [sp, #16]
+ stp d11, d10, [sp, #32]
+ stp d9, d8, [sp, #48]
+ stp x29, x30, [sp, #64]
+ stp x28, x27, [sp, #80]
+ stp x26, x25, [sp, #96]
+ stp x24, x23, [sp, #112]
+ stp x22, x21, [sp, #128]
+ stp x20, x19, [sp, #144]
+ mov x29, sp
+ sub sp, sp, #384
+ .cfi_def_cfa w29, 160
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w21, -24
+ .cfi_offset w22, -32
+ .cfi_offset w23, -40
+ .cfi_offset w24, -48
+ .cfi_offset w25, -56
+ .cfi_offset w26, -64
+ .cfi_offset w27, -72
+ .cfi_offset w28, -80
+ .cfi_offset w30, -88
+ .cfi_offset w29, -96
+ .cfi_offset b8, -104
+ .cfi_offset b9, -112
+ .cfi_offset b10, -120
+ .cfi_offset b11, -128
+ .cfi_offset b12, -136
+ .cfi_offset b13, -144
+ .cfi_offset b14, -152
+ .cfi_offset b15, -160
+ ldr x26, [x29, #168]
+ ldrb w27, [x29, #160]
+ mov w19, w6
+ mov x20, x4
+ mov x22, x2
+ mov x28, x1
+ cmp x1, #4
+ mov x24, x0
+ str x3, [sp, #40]
+ b.lo .LBB2_8
+ adrp x9, .LCPI2_0
+ ldr q0, [x9, :lo12:.LCPI2_0]
+ sbfx w11, w5, #0, #1
+ dup v1.4s, w11
+ mov w9, #58983
+ mov w10, #44677
+ and v0.16b, v1.16b, v0.16b
+ mov w11, #62322
+ mov w12, #62778
+ orr w8, w7, w19
+ movk w9, #27145, lsl #16
+ movk w10, #47975, lsl #16
+ movk w11, #15470, lsl #16
+ str q0, [sp, #16]
+ orr v0.4s, #128, lsl #24
+ movk w12, #42319, lsl #16
+ str q0, [sp]
+.LBB2_2:
+ ldr x0, [sp, #40]
+ mov x13, x0
+ ld1r { v20.4s }, [x13], #4
+ add x14, x0, #8
+ add x15, x0, #12
+ add x16, x0, #16
+ add x17, x0, #20
+ add x18, x0, #24
+ add x0, x0, #28
+ ld1r { v17.4s }, [x14]
+ ld1r { v6.4s }, [x15]
+ ld1r { v8.4s }, [x16]
+ ld1r { v9.4s }, [x17]
+ ld1r { v31.4s }, [x18]
+ ld1r { v26.4s }, [x13]
+ ld1r { v15.4s }, [x0]
+ cbz x22, .LBB2_7
+ ldr q1, [sp, #16]
+ dup v0.4s, w20
+ ldp x13, x14, [x24]
+ ldp x15, x16, [x24, #16]
+ add v1.4s, v0.4s, v1.4s
+ movi v0.4s, #128, lsl #24
+ str q1, [sp, #64]
+ eor v0.16b, v1.16b, v0.16b
+ ldr q1, [sp]
+ lsr x18, x20, #32
+ mov x17, xzr
+ cmgt v0.4s, v1.4s, v0.4s
+ dup v1.4s, w18
+ sub v0.4s, v1.4s, v0.4s
+ mov w18, w8
+ str q0, [sp, #48]
+.LBB2_4:
+ mov w2, #16
+ bfi x2, x17, #6, #58
+ ldr q1, [x13, x2]
+ ldr q3, [x14, x2]
+ ldr q2, [x15, x2]
+ ldr q4, [x16, x2]
+ mov w2, #32
+ bfi x2, x17, #6, #58
+ ldr q5, [x13, x2]
+ ldr q18, [x14, x2]
+ ldr q19, [x15, x2]
+ ldr q23, [x16, x2]
+ mov w2, #48
+ lsl x3, x17, #6
+ bfi x2, x17, #6, #58
+ add x17, x17, #1
+ ldr q0, [x13, x3]
+ ldr q21, [x14, x3]
+ ldr q7, [x15, x3]
+ ldr q16, [x16, x3]
+ cmp x17, x22
+ ldr q13, [x13, x2]
+ ldr q14, [x14, x2]
+ ldr q29, [x15, x2]
+ ldr q10, [x16, x2]
+ csel w2, w27, wzr, eq
+ orr w18, w2, w18
+ mov x0, xzr
+ and w18, w18, #0xff
+ add x3, x3, #256
+.LBB2_5:
+ ldr x2, [x24, x0]
+ add x0, x0, #8
+ cmp x0, #32
+ add x2, x2, x3
+ prfm pldl1keep, [x2]
+ b.ne .LBB2_5
+ dup v22.4s, w18
+ str q22, [sp, #192]
+ zip1 v27.4s, v0.4s, v21.4s
+ zip2 v21.4s, v0.4s, v21.4s
+ zip1 v0.4s, v7.4s, v16.4s
+ zip2 v22.4s, v7.4s, v16.4s
+ zip1 v7.4s, v1.4s, v3.4s
+ zip1 v25.4s, v2.4s, v4.4s
+ zip2 v16.4s, v2.4s, v4.4s
+ zip1 v11.4s, v19.4s, v23.4s
+ zip2 v12.4s, v19.4s, v23.4s
+ zip1 v19.4s, v13.4s, v14.4s
+ zip2 v23.4s, v13.4s, v14.4s
+ zip1 v13.4s, v29.4s, v10.4s
+ zip2 v14.4s, v29.4s, v10.4s
+ add v10.4s, v20.4s, v8.4s
+ add v2.4s, v26.4s, v9.4s
+ ext v20.16b, v22.16b, v21.16b, #8
+ ext v26.16b, v25.16b, v7.16b, #8
+ zip2 v24.4s, v1.4s, v3.4s
+ add v1.4s, v6.4s, v15.4s
+ ext v6.16b, v0.16b, v27.16b, #8
+ ext v20.16b, v21.16b, v20.16b, #8
+ mov v21.d[1], v22.d[0]
+ ext v22.16b, v7.16b, v26.16b, #8
+ mov v7.d[1], v25.d[0]
+ add v3.4s, v17.4s, v31.4s
+ str q1, [sp, #144]
+ ext v1.16b, v27.16b, v6.16b, #8
+ mov v6.16b, v7.16b
+ zip1 v28.4s, v5.4s, v18.4s
+ stur q1, [x29, #-80]
+ mov v1.16b, v27.16b
+ mov v27.16b, v24.16b
+ add v3.4s, v3.4s, v6.4s
+ ldr q6, [sp, #64]
+ ext v29.16b, v16.16b, v24.16b, #8
+ mov v1.d[1], v0.d[0]
+ ext v0.16b, v11.16b, v28.16b, #8
+ mov v27.d[1], v16.d[0]
+ ext v16.16b, v14.16b, v23.16b, #8
+ stur q7, [x29, #-144]
+ ext v7.16b, v24.16b, v29.16b, #8
+ ext v29.16b, v28.16b, v0.16b, #8
+ ext v0.16b, v23.16b, v16.16b, #8
+ mov v23.d[1], v14.d[0]
+ stp q0, q23, [sp, #80]
+ add v0.4s, v10.4s, v1.4s
+ eor v16.16b, v0.16b, v6.16b
+ ldr q6, [sp, #48]
+ add v2.4s, v2.4s, v21.4s
+ mov v28.d[1], v11.d[0]
+ zip2 v18.4s, v5.4s, v18.4s
+ eor v10.16b, v2.16b, v6.16b
+ movi v6.4s, #64
+ eor v11.16b, v3.16b, v6.16b
+ ldr q6, [sp, #144]
+ dup v17.4s, w9
+ ext v30.16b, v12.16b, v18.16b, #8
+ rev32 v16.8h, v16.8h
+ dup v5.4s, w10
+ ext v25.16b, v18.16b, v30.16b, #8
+ mov v30.16b, v23.16b
+ mov v23.16b, v1.16b
+ str q1, [sp, #160]
+ rev32 v10.8h, v10.8h
+ add v1.4s, v16.4s, v17.4s
+ add v17.4s, v6.4s, v27.4s
+ ldr q6, [sp, #192]
+ dup v4.4s, w11
+ rev32 v11.8h, v11.8h
+ add v5.4s, v10.4s, v5.4s
+ eor v8.16b, v1.16b, v8.16b
+ stur q21, [x29, #-128]
+ mov v18.d[1], v12.d[0]
+ add v4.4s, v11.4s, v4.4s
+ eor v9.16b, v5.16b, v9.16b
+ ushr v12.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ ldur q21, [x29, #-80]
+ ext v26.16b, v13.16b, v19.16b, #8
+ eor v31.16b, v4.16b, v31.16b
+ orr v8.16b, v8.16b, v12.16b
+ ushr v12.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ ext v26.16b, v19.16b, v26.16b, #8
+ mov v19.d[1], v13.d[0]
+ orr v9.16b, v9.16b, v12.16b
+ ushr v12.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v17.16b, v6.16b
+ orr v31.16b, v31.16b, v12.16b
+ dup v12.4s, w12
+ rev32 v13.8h, v13.8h
+ add v12.4s, v13.4s, v12.4s
+ add v0.4s, v0.4s, v21.4s
+ eor v14.16b, v12.16b, v15.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v22.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v28.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v18.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v19.4s
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v30.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ mov v24.16b, v7.16b
+ stur q7, [x29, #-112]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ mov v7.16b, v26.16b
+ add v3.4s, v3.4s, v26.4s
+ ldr q26, [sp, #80]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ add v0.4s, v0.4s, v29.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v25.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v13.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ str q22, [sp, #128]
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ ldur q22, [x29, #-128]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ mov v6.16b, v18.16b
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ ldur q18, [x29, #-144]
+ orr v8.16b, v8.16b, v15.16b
+ add v0.4s, v0.4s, v22.4s
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v24.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v18.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v13.16b, v17.16b, v13.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v13.8h, v13.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v13.4s
+ add v0.4s, v0.4s, v27.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v6.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v23.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v21.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v19.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v29.4s
+ str q28, [sp, #112]
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldp q28, q23, [sp, #112]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ldr q21, [sp, #96]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ add v0.4s, v0.4s, v25.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v23.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v21.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v28.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v13.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ mov v30.16b, v29.16b
+ mov v29.16b, v25.16b
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ ldur q25, [x29, #-112]
+ orr v8.16b, v8.16b, v15.16b
+ add v0.4s, v0.4s, v20.4s
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v6.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v7.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v25.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v13.16b, v17.16b, v13.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v13.8h, v13.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v13.4s
+ add v0.4s, v0.4s, v18.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v8.4s
+ add v2.4s, v2.4s, v19.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v16.16b, v0.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v22.4s
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v21.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v14.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v13.16b, v17.16b, v13.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v13.4s, #8
+ shl v13.4s, v13.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v13.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v14.16b, v12.16b, v14.16b
+ add v0.4s, v0.4s, v27.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #7
+ shl v14.4s, v14.4s, #25
+ add v0.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v14.16b, v14.16b, v15.16b
+ eor v13.16b, v0.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v29.4s
+ rev32 v13.8h, v13.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v28.4s
+ add v4.4s, v4.4s, v13.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ ldr q24, [sp, #160]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v14.16b, v1.16b, v14.16b
+ add v5.4s, v5.4s, v11.4s
+ stur q7, [x29, #-64]
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v14.4s, #12
+ shl v14.4s, v14.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ mov v7.16b, v26.16b
+ add v3.4s, v3.4s, v26.4s
+ ldur q26, [x29, #-80]
+ orr v14.16b, v14.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ add v0.4s, v0.4s, v23.4s
+ orr v8.16b, v8.16b, v15.16b
+ add v15.4s, v0.4s, v9.4s
+ add v2.4s, v2.4s, v24.4s
+ eor v0.16b, v15.16b, v13.16b
+ add v2.4s, v2.4s, v31.4s
+ ushr v13.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v14.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v0.16b, v0.16b, v13.16b
+ ushr v13.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v13.16b
+ ushr v13.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v13.16b
+ ushr v13.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v13.16b
+ ushr v13.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ orr v9.16b, v9.16b, v13.16b
+ ushr v13.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ add v1.4s, v10.4s, v1.4s
+ orr v31.16b, v31.16b, v13.16b
+ eor v13.16b, v1.16b, v14.16b
+ add v5.4s, v11.4s, v5.4s
+ ushr v14.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v14.16b
+ ushr v14.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ stur q6, [x29, #-96]
+ orr v8.16b, v8.16b, v14.16b
+ add v14.4s, v15.4s, v6.4s
+ ldur q6, [x29, #-64]
+ mov v18.16b, v19.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v18.4s
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v21.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v6.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ str q27, [sp, #176]
+ mov v27.16b, v30.16b
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ add v14.4s, v14.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v27.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v20.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v7.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ mov v30.16b, v23.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v0.16b, v17.16b, v0.16b
+ add v1.4s, v16.4s, v1.4s
+ ldur q23, [x29, #-144]
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v0.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v23.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v29.4s
+ orr v13.16b, v13.16b, v15.16b
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v30.4s
+ rev32 v0.8h, v0.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ add v4.4s, v4.4s, v0.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldur q22, [x29, #-128]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ ldr q26, [sp, #176]
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v5.4s, v11.4s
+ add v14.4s, v14.4s, v24.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v22.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v28.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v15.16b
+ add v14.4s, v14.4s, v18.4s
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v27.4s
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v7.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v21.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ add v14.4s, v14.4s, v6.4s
+ ldur q6, [x29, #-96]
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ stur q20, [x29, #-160]
+ mov v20.16b, v29.16b
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ mov v19.16b, v29.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ mov v19.16b, v28.16b
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v6.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v19.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v0.16b, v17.16b, v0.16b
+ add v1.4s, v16.4s, v1.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v10.4s, v5.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v0.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v25.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v13.16b, v13.16b, v15.16b
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v24.4s
+ rev32 v0.8h, v0.8h
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v26.4s
+ mov v29.16b, v27.16b
+ add v4.4s, v4.4s, v0.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ ldur q27, [x29, #-160]
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v12.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v17.16b, v11.16b
+ ldur q6, [x29, #-80]
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v1.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v5.4s, v11.4s
+ add v14.4s, v14.4s, v22.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ add v14.4s, v14.4s, v9.4s
+ add v2.4s, v2.4s, v27.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v0.16b, v14.16b, v0.16b
+ add v2.4s, v2.4s, v31.4s
+ add v3.4s, v3.4s, v6.4s
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ eor v16.16b, v2.16b, v16.16b
+ add v3.4s, v3.4s, v13.4s
+ add v17.4s, v17.4s, v23.4s
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v3.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ eor v11.16b, v17.16b, v11.16b
+ add v4.4s, v0.4s, v4.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v11.4s, #8
+ shl v11.4s, v11.4s, #24
+ eor v9.16b, v4.16b, v9.16b
+ add v12.4s, v16.4s, v12.4s
+ orr v11.16b, v11.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v31.16b, v12.16b, v31.16b
+ add v1.4s, v10.4s, v1.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ eor v13.16b, v1.16b, v13.16b
+ add v5.4s, v11.4s, v5.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #7
+ shl v13.4s, v13.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v15.16b
+ add v14.4s, v14.4s, v29.4s
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v20.4s
+ mov v28.16b, v7.16b
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ add v3.4s, v3.4s, v19.4s
+ rev32 v16.8h, v16.8h
+ eor v10.16b, v2.16b, v10.16b
+ add v3.4s, v3.4s, v31.4s
+ add v17.4s, v17.4s, v28.4s
+ add v1.4s, v1.4s, v16.4s
+ rev32 v10.8h, v10.8h
+ eor v11.16b, v3.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ eor v8.16b, v1.16b, v8.16b
+ add v5.4s, v5.4s, v10.4s
+ rev32 v11.8h, v11.8h
+ eor v0.16b, v17.16b, v0.16b
+ ushr v15.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ eor v9.16b, v5.16b, v9.16b
+ add v4.4s, v4.4s, v11.4s
+ rev32 v0.8h, v0.8h
+ orr v8.16b, v8.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v31.16b, v4.16b, v31.16b
+ add v12.4s, v12.4s, v0.4s
+ add v14.4s, v14.4s, v21.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ eor v13.16b, v12.16b, v13.16b
+ add v14.4s, v14.4s, v8.4s
+ add v2.4s, v2.4s, v30.4s
+ orr v31.16b, v31.16b, v15.16b
+ ushr v15.4s, v13.4s, #12
+ shl v13.4s, v13.4s, #20
+ eor v16.16b, v14.16b, v16.16b
+ add v2.4s, v2.4s, v9.4s
+ orr v13.16b, v13.16b, v15.16b
+ ushr v15.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v10.16b, v2.16b, v10.16b
+ orr v16.16b, v16.16b, v15.16b
+ ushr v15.4s, v10.4s, #8
+ shl v10.4s, v10.4s, #24
+ add v3.4s, v3.4s, v18.4s
+ orr v10.16b, v10.16b, v15.16b
+ add v15.4s, v3.4s, v31.4s
+ eor v3.16b, v15.16b, v11.16b
+ ushr v11.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v11.16b, v3.16b, v11.16b
+ add v3.4s, v17.4s, v6.4s
+ add v17.4s, v3.4s, v13.4s
+ eor v0.16b, v17.16b, v0.16b
+ ushr v3.4s, v0.4s, #8
+ shl v0.4s, v0.4s, #24
+ add v1.4s, v16.4s, v1.4s
+ orr v0.16b, v0.16b, v3.16b
+ eor v3.16b, v1.16b, v8.16b
+ ushr v8.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ add v5.4s, v10.4s, v5.4s
+ orr v8.16b, v3.16b, v8.16b
+ eor v3.16b, v5.16b, v9.16b
+ add v4.4s, v11.4s, v4.4s
+ ushr v9.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ eor v31.16b, v4.16b, v31.16b
+ mov v7.16b, v23.16b
+ mov v23.16b, v28.16b
+ mov v28.16b, v6.16b
+ orr v3.16b, v3.16b, v9.16b
+ ushr v9.4s, v31.4s, #7
+ shl v31.4s, v31.4s, #25
+ ldur q6, [x29, #-64]
+ orr v31.16b, v31.16b, v9.16b
+ add v9.4s, v0.4s, v12.4s
+ eor v12.16b, v9.16b, v13.16b
+ ushr v13.4s, v12.4s, #7
+ shl v12.4s, v12.4s, #25
+ orr v12.16b, v12.16b, v13.16b
+ add v13.4s, v14.4s, v6.4s
+ add v13.4s, v13.4s, v3.4s
+ eor v0.16b, v13.16b, v0.16b
+ add v2.4s, v2.4s, v24.4s
+ rev32 v14.8h, v0.8h
+ add v0.4s, v2.4s, v31.4s
+ add v6.4s, v4.4s, v14.4s
+ eor v2.16b, v0.16b, v16.16b
+ eor v3.16b, v6.16b, v3.16b
+ rev32 v16.8h, v2.8h
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v2.4s, v9.4s, v16.4s
+ orr v4.16b, v3.16b, v4.16b
+ eor v3.16b, v2.16b, v31.16b
+ ushr v31.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v31.16b
+ add v31.4s, v15.4s, v22.4s
+ add v31.4s, v31.4s, v12.4s
+ add v17.4s, v17.4s, v7.4s
+ eor v9.16b, v31.16b, v10.16b
+ add v17.4s, v17.4s, v8.4s
+ rev32 v9.8h, v9.8h
+ eor v11.16b, v17.16b, v11.16b
+ add v1.4s, v1.4s, v9.4s
+ rev32 v11.8h, v11.8h
+ eor v10.16b, v1.16b, v12.16b
+ add v5.4s, v5.4s, v11.4s
+ ushr v12.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v8.16b, v5.16b, v8.16b
+ orr v10.16b, v10.16b, v12.16b
+ ushr v12.4s, v8.4s, #12
+ shl v8.4s, v8.4s, #20
+ orr v8.16b, v8.16b, v12.16b
+ add v12.4s, v13.4s, v27.4s
+ add v12.4s, v12.4s, v4.4s
+ eor v13.16b, v12.16b, v14.16b
+ ldur q14, [x29, #-96]
+ mov v25.16b, v29.16b
+ add v29.4s, v12.4s, v20.4s
+ add v20.4s, v31.4s, v26.4s
+ add v0.4s, v0.4s, v14.4s
+ add v0.4s, v0.4s, v3.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v0.4s, v0.4s, v30.4s
+ ldur q30, [x29, #-112]
+ add v20.4s, v20.4s, v10.4s
+ eor v31.16b, v20.16b, v9.16b
+ add v20.4s, v20.4s, v28.4s
+ add v17.4s, v17.4s, v30.4s
+ add v17.4s, v17.4s, v8.4s
+ eor v9.16b, v17.16b, v11.16b
+ ushr v28.4s, v13.4s, #8
+ shl v11.4s, v13.4s, #24
+ orr v28.16b, v11.16b, v28.16b
+ ushr v11.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ orr v16.16b, v16.16b, v11.16b
+ ushr v11.4s, v31.4s, #8
+ shl v31.4s, v31.4s, #24
+ add v6.4s, v28.4s, v6.4s
+ orr v31.16b, v31.16b, v11.16b
+ ushr v11.4s, v9.4s, #8
+ shl v9.4s, v9.4s, #24
+ add v2.4s, v16.4s, v2.4s
+ eor v4.16b, v6.16b, v4.16b
+ orr v9.16b, v9.16b, v11.16b
+ add v1.4s, v31.4s, v1.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v11.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ add v5.4s, v9.4s, v5.4s
+ eor v10.16b, v1.16b, v10.16b
+ orr v4.16b, v4.16b, v11.16b
+ ushr v11.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ eor v8.16b, v5.16b, v8.16b
+ orr v3.16b, v3.16b, v11.16b
+ ushr v11.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ orr v10.16b, v10.16b, v11.16b
+ ushr v11.4s, v8.4s, #7
+ shl v8.4s, v8.4s, #25
+ orr v8.16b, v8.16b, v11.16b
+ add v29.4s, v29.4s, v8.4s
+ eor v16.16b, v29.16b, v16.16b
+ add v0.4s, v0.4s, v4.4s
+ mov v12.16b, v26.16b
+ add v17.4s, v17.4s, v19.4s
+ add v26.4s, v29.4s, v23.4s
+ eor v29.16b, v0.16b, v31.16b
+ add v20.4s, v20.4s, v3.4s
+ rev32 v16.8h, v16.8h
+ stur q18, [x29, #-176]
+ mov v18.16b, v27.16b
+ add v0.4s, v0.4s, v24.4s
+ eor v27.16b, v20.16b, v9.16b
+ add v17.4s, v17.4s, v10.4s
+ rev32 v24.8h, v29.8h
+ add v1.4s, v1.4s, v16.4s
+ add v20.4s, v20.4s, v25.4s
+ eor v25.16b, v17.16b, v28.16b
+ rev32 v27.8h, v27.8h
+ add v5.4s, v5.4s, v24.4s
+ eor v28.16b, v1.16b, v8.16b
+ rev32 v25.8h, v25.8h
+ add v6.4s, v6.4s, v27.4s
+ eor v4.16b, v5.16b, v4.16b
+ ushr v31.4s, v28.4s, #12
+ shl v28.4s, v28.4s, #20
+ add v2.4s, v2.4s, v25.4s
+ eor v3.16b, v6.16b, v3.16b
+ orr v28.16b, v28.16b, v31.16b
+ ushr v31.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ eor v29.16b, v2.16b, v10.16b
+ orr v4.16b, v4.16b, v31.16b
+ ushr v31.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v26.4s, v26.4s, v28.4s
+ orr v3.16b, v3.16b, v31.16b
+ ushr v31.4s, v29.4s, #12
+ shl v29.4s, v29.4s, #20
+ eor v16.16b, v26.16b, v16.16b
+ add v0.4s, v0.4s, v4.4s
+ add v17.4s, v17.4s, v12.4s
+ orr v29.16b, v29.16b, v31.16b
+ eor v24.16b, v0.16b, v24.16b
+ add v0.4s, v0.4s, v22.4s
+ add v20.4s, v20.4s, v3.4s
+ ushr v22.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ add v23.4s, v26.4s, v21.4s
+ eor v21.16b, v20.16b, v27.16b
+ add v17.4s, v17.4s, v29.4s
+ orr v16.16b, v16.16b, v22.16b
+ ushr v22.4s, v24.4s, #8
+ shl v24.4s, v24.4s, #24
+ eor v25.16b, v17.16b, v25.16b
+ orr v22.16b, v24.16b, v22.16b
+ ushr v24.4s, v21.4s, #8
+ shl v21.4s, v21.4s, #24
+ orr v21.16b, v21.16b, v24.16b
+ ushr v24.4s, v25.4s, #8
+ shl v25.4s, v25.4s, #24
+ add v1.4s, v16.4s, v1.4s
+ orr v24.16b, v25.16b, v24.16b
+ add v5.4s, v22.4s, v5.4s
+ eor v25.16b, v1.16b, v28.16b
+ add v6.4s, v21.4s, v6.4s
+ eor v4.16b, v5.16b, v4.16b
+ ushr v27.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ add v2.4s, v24.4s, v2.4s
+ eor v3.16b, v6.16b, v3.16b
+ orr v25.16b, v25.16b, v27.16b
+ ushr v27.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ ldur q19, [x29, #-176]
+ eor v26.16b, v2.16b, v29.16b
+ orr v4.16b, v4.16b, v27.16b
+ ushr v27.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ orr v3.16b, v3.16b, v27.16b
+ ushr v27.4s, v26.4s, #7
+ shl v26.4s, v26.4s, #25
+ add v20.4s, v20.4s, v18.4s
+ add v17.4s, v17.4s, v30.4s
+ orr v26.16b, v26.16b, v27.16b
+ add v0.4s, v0.4s, v3.4s
+ eor v16.16b, v0.16b, v16.16b
+ add v0.4s, v0.4s, v19.4s
+ add v19.4s, v20.4s, v26.4s
+ add v17.4s, v17.4s, v25.4s
+ eor v20.16b, v19.16b, v22.16b
+ add v7.4s, v19.4s, v7.4s
+ eor v19.16b, v17.16b, v21.16b
+ ldur q21, [x29, #-64]
+ add v23.4s, v23.4s, v4.4s
+ eor v24.16b, v23.16b, v24.16b
+ rev32 v16.8h, v16.8h
+ add v17.4s, v17.4s, v21.4s
+ rev32 v21.8h, v24.8h
+ add v6.4s, v6.4s, v21.4s
+ rev32 v20.8h, v20.8h
+ add v2.4s, v2.4s, v16.4s
+ eor v4.16b, v6.16b, v4.16b
+ rev32 v19.8h, v19.8h
+ add v1.4s, v1.4s, v20.4s
+ eor v3.16b, v2.16b, v3.16b
+ ushr v24.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v5.4s, v5.4s, v19.4s
+ eor v22.16b, v1.16b, v26.16b
+ orr v4.16b, v4.16b, v24.16b
+ ushr v24.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v18.4s, v23.4s, v14.4s
+ eor v23.16b, v5.16b, v25.16b
+ orr v3.16b, v3.16b, v24.16b
+ ushr v24.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ orr v22.16b, v22.16b, v24.16b
+ ushr v24.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v24.16b
+ add v18.4s, v18.4s, v4.4s
+ add v0.4s, v0.4s, v3.4s
+ add v24.4s, v17.4s, v23.4s
+ eor v17.16b, v18.16b, v21.16b
+ add v7.4s, v7.4s, v22.4s
+ eor v16.16b, v0.16b, v16.16b
+ ushr v21.4s, v17.4s, #8
+ shl v17.4s, v17.4s, #24
+ eor v20.16b, v7.16b, v20.16b
+ orr v21.16b, v17.16b, v21.16b
+ ushr v17.4s, v16.4s, #8
+ shl v16.4s, v16.4s, #24
+ eor v19.16b, v24.16b, v19.16b
+ orr v16.16b, v16.16b, v17.16b
+ ushr v17.4s, v20.4s, #8
+ shl v20.4s, v20.4s, #24
+ orr v25.16b, v20.16b, v17.16b
+ ushr v17.4s, v19.4s, #8
+ shl v19.4s, v19.4s, #24
+ orr v19.16b, v19.16b, v17.16b
+ add v1.4s, v25.4s, v1.4s
+ eor v22.16b, v1.16b, v22.16b
+ eor v20.16b, v1.16b, v18.16b
+ add v1.4s, v19.4s, v5.4s
+ eor v26.16b, v1.16b, v0.16b
+ add v0.4s, v21.4s, v6.4s
+ eor v5.16b, v1.16b, v23.16b
+ eor v1.16b, v0.16b, v4.16b
+ eor v17.16b, v0.16b, v7.16b
+ add v0.4s, v16.4s, v2.4s
+ eor v2.16b, v0.16b, v3.16b
+ eor v6.16b, v0.16b, v24.16b
+ ushr v0.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v0.16b, v1.16b, v0.16b
+ ushr v1.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v2.16b, v1.16b
+ ushr v2.4s, v22.4s, #7
+ shl v3.4s, v22.4s, #25
+ orr v2.16b, v3.16b, v2.16b
+ ushr v3.4s, v5.4s, #7
+ shl v4.4s, v5.4s, #25
+ orr v3.16b, v4.16b, v3.16b
+ eor v8.16b, v16.16b, v3.16b
+ eor v9.16b, v25.16b, v0.16b
+ eor v31.16b, v1.16b, v19.16b
+ cmp x17, x22
+ eor v15.16b, v2.16b, v21.16b
+ mov w18, w19
+ b.ne .LBB2_4
+.LBB2_7:
+ zip1 v0.4s, v20.4s, v26.4s
+ zip2 v1.4s, v20.4s, v26.4s
+ zip1 v2.4s, v17.4s, v6.4s
+ zip2 v3.4s, v17.4s, v6.4s
+ zip1 v4.4s, v8.4s, v9.4s
+ zip2 v5.4s, v8.4s, v9.4s
+ zip1 v6.4s, v31.4s, v15.4s
+ zip2 v7.4s, v31.4s, v15.4s
+ add x13, x20, #4
+ tst w5, #0x1
+ sub x28, x28, #4
+ zip1 v16.2d, v0.2d, v2.2d
+ zip2 v0.2d, v0.2d, v2.2d
+ zip1 v2.2d, v1.2d, v3.2d
+ zip2 v1.2d, v1.2d, v3.2d
+ zip1 v3.2d, v4.2d, v6.2d
+ zip2 v4.2d, v4.2d, v6.2d
+ zip1 v6.2d, v5.2d, v7.2d
+ zip2 v5.2d, v5.2d, v7.2d
+ add x24, x24, #32
+ csel x20, x13, x20, ne
+ cmp x28, #3
+ stp q16, q3, [x26]
+ stp q0, q4, [x26, #32]
+ stp q2, q6, [x26, #64]
+ stp q1, q5, [x26, #96]
+ add x26, x26, #128
+ b.hi .LBB2_2
+.LBB2_8:
+ cbz x28, .LBB2_16
+ orr w8, w7, w19
+ and x21, x5, #0x1
+ stur w8, [x29, #-64]
+.LBB2_10:
+ ldr x8, [sp, #40]
+ ldr x25, [x24]
+ ldur w4, [x29, #-64]
+ ldp q1, q0, [x8]
+ mov x8, x22
+ stp q1, q0, [x29, #-48]
+.LBB2_11:
+ subs x23, x8, #1
+ b.eq .LBB2_13
+ cbnz x8, .LBB2_14
+ b .LBB2_15
+.LBB2_13:
+ orr w4, w4, w27
+.LBB2_14:
+ sub x0, x29, #48
+ mov w2, #64
+ mov x1, x25
+ mov x3, x20
+ bl zfs_blake3_compress_in_place_sse2
+ add x25, x25, #64
+ mov x8, x23
+ mov w4, w19
+ b .LBB2_11
+.LBB2_15:
+ ldp q0, q1, [x29, #-48]
+ add x20, x20, x21
+ add x24, x24, #8
+ subs x28, x28, #1
+ stp q0, q1, [x26], #32
+ b.ne .LBB2_10
+.LBB2_16:
+ add sp, sp, #384
+ ldp x20, x19, [sp, #144]
+ ldp x22, x21, [sp, #128]
+ ldp x24, x23, [sp, #112]
+ ldp x26, x25, [sp, #96]
+ ldp x28, x27, [sp, #80]
+ ldp x29, x30, [sp, #64]
+ ldp d9, d8, [sp, #48]
+ ldp d11, d10, [sp, #32]
+ ldp d13, d12, [sp, #16]
+ ldp d15, d14, [sp], #160
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
new file mode 100644
index 000000000..eb6946400
--- /dev/null
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@@ -0,0 +1,2463 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE4.1 -> ARMv8-A
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if defined(__aarch64__)
+ .text
+ .section .rodata.cst16,"aM",@progbits,16 /* 16-byte constants loaded by zfs_blake3_compress_in_place_sse41 */
+ .p2align 4
+.LCPI0_0: /* single-register tbl indices {2,3,0,1} per 32-bit lane: rotate each lane right by 16 bits */
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI0_1: /* 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A: BLAKE3 IV words 0-3 */
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI0_2: /* single-register tbl indices {1,2,3,0} per 32-bit lane: rotate each lane right by 8 bits */
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI0_3: /* two-register tbl { A, B }: result lanes = { A0, B1, A2, B3 } (indices 16-31 select from B) */
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 20
+ .byte 21
+ .byte 22
+ .byte 23
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+.LCPI0_4: /* two-register tbl { A, B }: result lanes = { A0, A1, A2, B3 } */
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 4
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+ .text /* zfs_blake3_compress_in_place_sse41: one BLAKE3 compression; the 32-byte state at x0 is replaced by the result */
+ .globl zfs_blake3_compress_in_place_sse41 /* in: x0 = 8 x u32 state (read at entry, overwritten at exit), x1 = 64-byte message block */
+ .p2align 2 /* in: x3 = 64-bit counter; only the low byte of w2 and of w4 is used (block_len and flags in upstream BLAKE3 naming - confirm against the C prototype) */
+ .type zfs_blake3_compress_in_place_sse41,@function /* working state is four 4-lane rows: row0 = state[0..3], row1 = state[4..7], row2 = .LCPI0_1, row3 = { x3 low, x3 high, w2 & 0xff, w4 & 0xff } */
+zfs_blake3_compress_in_place_sse41: /* leaf routine: no stack use; writes only x8-x13, v0-v7, v16-v29 and the 32 bytes at x0 */
+ .cfi_startproc /* 7 rounds: 14 rotate-by-16 shuffles (tbl ..., v0.16b) below, one per column step and one per diagonal step */
+ ldp q7, q6, [x0] /* v7 = row0 = state[0..3], v6 = row1 = state[4..7] */
+ ldp q17, q18, [x1] /* message words 0-3 and 4-7 */
+ add x12, x1, #32
+ ld2 { v4.4s, v5.4s }, [x12] /* message bytes 32..63 de-interleaved: v4 = words 8,10,12,14; v5 = words 9,11,13,15 */
+ lsr x10, x3, #32 /* w10 = high 32 bits of the counter */
+ fmov s16, w3 /* v16 lane 0 = low 32 bits of the counter */
+ adrp x13, .LCPI0_0
+ adrp x11, .LCPI0_1
+ and w8, w2, #0xff
+ mov v16.s[1], w10 /* v16 lane 1 = counter high */
+ ldr q0, [x13, :lo12:.LCPI0_0] /* v0 = rotate-right-16 byte shuffle, kept for the whole function */
+ ldr q20, [x11, :lo12:.LCPI0_1] /* v20 = row2 initial value */
+ adrp x11, .LCPI0_4
+ and w9, w4, #0xff
+ ldr q2, [x11, :lo12:.LCPI0_4] /* v2 = two-register blend table { A0, A1, A2, B3 } */
+ mov v16.s[2], w8 /* v16 lane 2 = w2 & 0xff */
+ uzp1 v21.4s, v17.4s, v18.4s /* message words 0,2,4,6 */
+ add v7.4s, v6.4s, v7.4s /* row0 += row1 */
+ adrp x12, .LCPI0_3
+ mov v16.s[3], w9 /* v16 lane 3 = w4 & 0xff; v16 is now row3 */
+ uzp2 v18.4s, v17.4s, v18.4s /* message words 1,3,5,7 */
+ add v7.4s, v7.4s, v21.4s /* row0 += message words 0,2,4,6 */
+ ext v17.16b, v5.16b, v5.16b, #12
+ ldr q3, [x12, :lo12:.LCPI0_3] /* v3 = two-register blend table { A0, B1, A2, B3 } */
+ ext v24.16b, v4.16b, v4.16b, #12
+ eor v16.16b, v7.16b, v16.16b /* row3 ^= row0 */
+ mov v27.16b, v17.16b
+ uzp1 v19.4s, v21.4s, v21.4s
+ ext v25.16b, v21.16b, v21.16b, #12
+ zip2 v28.4s, v18.4s, v17.4s
+ tbl v29.16b, { v16.16b }, v0.16b /* row3 = row3 ror 16 */
+ mov v27.s[1], v24.s[2]
+ zip1 v23.2d, v17.2d, v18.2d
+ ext v19.16b, v19.16b, v21.16b, #8
+ add v22.4s, v29.4s, v20.4s /* row2 += row3 */
+ ext v26.16b, v21.16b, v25.16b, #12
+ tbl v20.16b, { v23.16b, v24.16b }, v2.16b
+ zip1 v21.4s, v28.4s, v24.4s
+ zip1 v23.4s, v24.4s, v28.4s
+ uzp2 v19.4s, v19.4s, v18.4s
+ eor v24.16b, v22.16b, v6.16b /* row1 ^= row2 */
+ ext v25.16b, v20.16b, v20.16b, #12
+ ext v6.16b, v23.16b, v21.16b, #8
+ add v7.4s, v7.4s, v18.4s /* row0 += message words 1,3,5,7 */
+ ext v18.16b, v19.16b, v19.16b, #4
+ tbl v16.16b, { v26.16b, v27.16b }, v3.16b
+ uzp1 v21.4s, v20.4s, v25.4s
+ mov v26.16b, v6.16b
+ ext v23.16b, v18.16b, v18.16b, #12
+ mov v26.s[1], v21.s[2]
+ adrp x10, .LCPI0_2
+ ext v25.16b, v18.16b, v23.16b, #12
+ uzp1 v23.4s, v18.4s, v18.4s
+ ldr q1, [x10, :lo12:.LCPI0_2] /* v1 = rotate-right-8 byte shuffle, kept for the whole function */
+ ext v18.16b, v23.16b, v18.16b, #8
+ ushr v23.4s, v24.4s, #12 /* ushr 12 + shl 20 + orr: row1 = row1 ror 12 */
+ shl v24.4s, v24.4s, #20
+ orr v23.16b, v24.16b, v23.16b
+ add v7.4s, v7.4s, v23.4s /* row0 += row1 */
+ eor v27.16b, v29.16b, v7.16b /* row3 ^= row0 */
+ add v4.4s, v7.4s, v4.4s
+ tbl v7.16b, { v25.16b, v26.16b }, v3.16b
+ tbl v26.16b, { v27.16b }, v1.16b /* row3 = row3 ror 8 */
+ add v22.4s, v22.4s, v26.4s /* row2 += row3 */
+ uzp2 v18.4s, v18.4s, v16.4s
+ eor v23.16b, v23.16b, v22.16b /* row1 ^= row2 */
+ ext v5.16b, v18.16b, v18.16b, #4
+ ushr v27.4s, v23.4s, #7 /* ushr 7 + shl 25 + orr: row1 = row1 ror 7 */
+ shl v23.4s, v23.4s, #25
+ uzp1 v25.4s, v5.4s, v5.4s
+ orr v23.16b, v23.16b, v27.16b
+ ext v28.16b, v4.16b, v4.16b, #12 /* lane-rotate row0 (message already added) for the diagonal step */
+ ext v4.16b, v25.16b, v5.16b, #8
+ ext v25.16b, v26.16b, v26.16b, #8 /* lane-rotate row3 by two lanes for the diagonal step */
+ add v26.4s, v28.4s, v23.4s
+ eor v25.16b, v26.16b, v25.16b
+ ext v22.16b, v22.16b, v22.16b, #4 /* lane-rotate row2 by one lane for the diagonal step */
+ tbl v25.16b, { v25.16b }, v0.16b /* from here on the same add / eor / rotate pattern repeats, interleaved with message-word permutation (zip, uzp, ext, two-register tbl via v2 / v3) */
+ add v22.4s, v22.4s, v25.4s
+ eor v23.16b, v23.16b, v22.16b
+ add v17.4s, v26.4s, v17.4s
+ ushr v26.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v26.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v25.16b, v25.16b, v17.16b
+ add v17.4s, v17.4s, v19.4s
+ tbl v19.16b, { v25.16b }, v1.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ ext v17.16b, v17.16b, v17.16b, #4
+ orr v23.16b, v23.16b, v25.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v17.4s, v17.4s, v23.4s
+ eor v19.16b, v17.16b, v19.16b
+ ext v22.16b, v22.16b, v22.16b, #12
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v17.4s, v17.4s, v16.4s
+ orr v23.16b, v23.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ ext v25.16b, v17.16b, v17.16b, #12
+ eor v17.16b, v19.16b, v17.16b
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v19.4s, v22.4s, v17.4s
+ eor v22.16b, v23.16b, v19.16b
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v6.2d, v16.2d
+ ushr v23.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ zip2 v24.4s, v16.4s, v6.4s
+ tbl v26.16b, { v20.16b, v21.16b }, v2.16b
+ orr v22.16b, v22.16b, v23.16b
+ zip1 v16.4s, v24.4s, v21.4s
+ zip1 v20.4s, v21.4s, v24.4s
+ ext v21.16b, v26.16b, v26.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v25.4s, v25.4s, v22.4s
+ ext v16.16b, v20.16b, v16.16b, #8
+ uzp1 v21.4s, v26.4s, v21.4s
+ eor v26.16b, v25.16b, v17.16b
+ ext v19.16b, v19.16b, v19.16b, #4
+ tbl v26.16b, { v26.16b }, v0.16b
+ mov v29.16b, v16.16b
+ add v19.4s, v19.4s, v26.4s
+ ext v27.16b, v5.16b, v5.16b, #12
+ mov v29.s[1], v21.s[2]
+ eor v22.16b, v22.16b, v19.16b
+ ext v28.16b, v5.16b, v27.16b, #12
+ ushr v27.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v25.4s, v6.4s
+ orr v22.16b, v22.16b, v27.16b
+ add v6.4s, v6.4s, v22.4s
+ eor v26.16b, v26.16b, v6.16b
+ add v6.4s, v6.4s, v18.4s
+ tbl v18.16b, { v26.16b }, v1.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v22.16b, v22.16b, v26.16b
+ ext v18.16b, v18.16b, v18.16b, #8
+ add v6.4s, v6.4s, v22.4s
+ eor v18.16b, v6.16b, v18.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ tbl v18.16b, { v18.16b }, v0.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v6.4s, v7.4s
+ orr v22.16b, v22.16b, v26.16b
+ add v6.4s, v6.4s, v22.4s
+ ext v26.16b, v6.16b, v6.16b, #12
+ eor v6.16b, v18.16b, v6.16b
+ uzp2 v4.4s, v4.4s, v7.4s
+ zip2 v25.4s, v7.4s, v16.4s
+ add v26.4s, v26.4s, v21.4s
+ zip1 v20.2d, v16.2d, v7.2d
+ tbl v6.16b, { v6.16b }, v1.16b
+ ext v24.16b, v4.16b, v4.16b, #4
+ tbl v27.16b, { v20.16b, v21.16b }, v2.16b
+ zip1 v7.4s, v25.4s, v21.4s
+ zip1 v20.4s, v21.4s, v25.4s
+ add v18.4s, v19.4s, v6.4s
+ uzp1 v5.4s, v24.4s, v24.4s
+ ext v21.16b, v27.16b, v27.16b, #12
+ ext v7.16b, v20.16b, v7.16b, #8
+ eor v19.16b, v22.16b, v18.16b
+ ext v5.16b, v5.16b, v24.16b, #8
+ tbl v17.16b, { v28.16b, v29.16b }, v3.16b
+ uzp1 v21.4s, v27.4s, v21.4s
+ mov v28.16b, v7.16b
+ ushr v22.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v23.16b, v24.16b, v24.16b, #12
+ uzp2 v5.4s, v5.4s, v17.4s
+ mov v28.s[1], v21.s[2]
+ orr v19.16b, v19.16b, v22.16b
+ ext v27.16b, v24.16b, v23.16b, #12
+ ext v23.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v25.16b, v18.16b, v18.16b, #4
+ add v18.4s, v26.4s, v19.4s
+ uzp1 v24.4s, v23.4s, v23.4s
+ eor v6.16b, v18.16b, v6.16b
+ ext v24.16b, v24.16b, v23.16b, #8
+ add v16.4s, v18.4s, v16.4s
+ tbl v18.16b, { v27.16b, v28.16b }, v3.16b
+ tbl v27.16b, { v6.16b }, v0.16b
+ uzp2 v6.4s, v24.4s, v18.4s
+ add v24.4s, v25.4s, v27.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v25.16b
+ add v16.4s, v16.4s, v19.4s
+ eor v25.16b, v27.16b, v16.16b
+ add v4.4s, v16.4s, v4.4s
+ tbl v16.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v16.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v16.16b, v16.16b, v16.16b, #8
+ add v4.4s, v4.4s, v19.4s
+ eor v16.16b, v4.16b, v16.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v25.16b, { v16.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v16.16b, v19.16b, v24.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v17.4s
+ orr v19.16b, v16.16b, v19.16b
+ add v27.4s, v4.4s, v19.4s
+ eor v25.16b, v25.16b, v27.16b
+ tbl v25.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v25.4s
+ zip2 v26.4s, v17.4s, v7.4s
+ ext v4.16b, v27.16b, v27.16b, #12
+ eor v19.16b, v19.16b, v24.16b
+ add v28.4s, v4.4s, v21.4s
+ zip1 v20.2d, v7.2d, v17.2d
+ zip1 v4.4s, v26.4s, v21.4s
+ zip1 v17.4s, v21.4s, v26.4s
+ ushr v26.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v26.16b
+ ext v25.16b, v25.16b, v25.16b, #8
+ add v27.4s, v28.4s, v19.4s
+ eor v25.16b, v27.16b, v25.16b
+ ext v24.16b, v24.16b, v24.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v19.16b, v19.16b, v24.16b
+ add v7.4s, v27.4s, v7.4s
+ ushr v27.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v27.16b
+ add v7.4s, v7.4s, v19.4s
+ eor v25.16b, v25.16b, v7.16b
+ add v5.4s, v7.4s, v5.4s
+ tbl v7.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v7.16b, v7.16b, v7.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ eor v7.16b, v5.16b, v7.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ tbl v16.16b, { v20.16b, v21.16b }, v2.16b
+ add v5.4s, v5.4s, v18.4s
+ orr v19.16b, v19.16b, v25.16b
+ ext v20.16b, v16.16b, v16.16b, #12
+ ext v4.16b, v17.16b, v4.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ uzp1 v21.4s, v16.4s, v20.4s
+ mov v17.16b, v4.16b
+ ext v25.16b, v5.16b, v5.16b, #12
+ mov v17.s[1], v21.s[2]
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v4.2d, v18.2d
+ ext v22.16b, v23.16b, v23.16b, #12
+ zip2 v26.4s, v18.4s, v4.4s
+ tbl v18.16b, { v20.16b, v21.16b }, v2.16b
+ eor v5.16b, v7.16b, v5.16b
+ ext v16.16b, v23.16b, v22.16b, #12
+ ext v22.16b, v6.16b, v6.16b, #4
+ zip1 v27.4s, v26.4s, v21.4s
+ zip1 v20.4s, v21.4s, v26.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ tbl v5.16b, { v5.16b }, v1.16b
+ ext v20.16b, v20.16b, v27.16b, #8
+ uzp1 v27.4s, v18.4s, v21.4s
+ uzp1 v18.4s, v22.4s, v22.4s
+ add v21.4s, v24.4s, v5.4s
+ ext v18.16b, v18.16b, v22.16b, #8
+ eor v19.16b, v19.16b, v21.16b
+ tbl v7.16b, { v16.16b, v17.16b }, v3.16b
+ uzp2 v18.4s, v18.4s, v17.4s
+ zip2 v16.4s, v16.4s, v20.4s
+ ushr v17.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v17.16b, v19.16b, v17.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v19.4s, v25.4s, v17.4s
+ eor v5.16b, v19.16b, v5.16b
+ ext v21.16b, v21.16b, v21.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v4.4s, v19.4s, v4.4s
+ add v19.4s, v21.4s, v5.4s
+ eor v17.16b, v17.16b, v19.16b
+ ushr v21.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ orr v17.16b, v17.16b, v21.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v4.4s, v4.4s, v6.4s
+ add v6.4s, v19.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v17.16b, v17.16b, v19.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v4.4s, v4.4s, v7.4s
+ orr v17.16b, v17.16b, v19.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ mov v29.16b, v20.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v6.4s, v6.4s, v5.4s
+ mov v29.s[1], v27.s[2]
+ add v4.4s, v4.4s, v27.4s
+ zip1 v26.2d, v20.2d, v7.2d
+ zip1 v7.4s, v16.4s, v27.4s
+ zip1 v16.4s, v27.4s, v16.4s
+ eor v17.16b, v17.16b, v6.16b
+ ext v7.16b, v16.16b, v7.16b, #8
+ ushr v16.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ orr v16.16b, v17.16b, v16.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ ushr v17.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v20.4s
+ orr v16.16b, v16.16b, v17.16b
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v17.4s, v16.4s, #7
+ shl v16.4s, v16.4s, #25
+ ext v23.16b, v22.16b, v22.16b, #12
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v16.16b, v16.16b, v17.16b
+ ext v28.16b, v22.16b, v23.16b, #12
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v16.4s, v4.4s
+ tbl v3.16b, { v28.16b, v29.16b }, v3.16b /* last use of the blend table in v3; the register is reused for data from here */
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v4.4s, v3.4s
+ tbl v4.16b, { v5.16b }, v0.16b
+ add v5.4s, v6.4s, v4.4s
+ eor v6.16b, v16.16b, v5.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ orr v6.16b, v6.16b, v16.16b
+ tbl v2.16b, { v26.16b, v27.16b }, v2.16b /* last use of the blend table in v2; the register is reused for data from here */
+ add v3.4s, v3.4s, v6.4s
+ ext v19.16b, v2.16b, v2.16b, #12
+ eor v4.16b, v4.16b, v3.16b
+ uzp1 v2.4s, v2.4s, v19.4s
+ ext v3.16b, v3.16b, v3.16b, #12
+ tbl v4.16b, { v4.16b }, v1.16b
+ add v2.4s, v3.4s, v2.4s
+ add v3.4s, v5.4s, v4.4s
+ eor v5.16b, v6.16b, v3.16b
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v6.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ add v2.4s, v2.4s, v5.4s
+ eor v4.16b, v2.16b, v4.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ tbl v0.16b, { v4.16b }, v0.16b /* last rotate-by-16; v0 now holds row3 instead of the shuffle table */
+ add v3.4s, v3.4s, v0.4s
+ eor v4.16b, v5.16b, v3.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v2.4s, v2.4s, v7.4s
+ orr v4.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v4.4s
+ eor v0.16b, v0.16b, v2.16b
+ tbl v0.16b, { v0.16b }, v1.16b /* last rotate-by-8; v1 is free for data afterwards */
+ add v1.4s, v3.4s, v0.4s /* v1 = row2 */
+ eor v3.16b, v4.16b, v1.16b /* row1 ^= row2, rotated by 7 just below */
+ ext v2.16b, v2.16b, v2.16b, #4 /* undo the diagonal lane rotation of row0 */
+ ext v1.16b, v1.16b, v1.16b, #12 /* undo the diagonal lane rotation of row2 */
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v0.16b, v0.16b, v0.16b, #8 /* undo the diagonal lane rotation of row3 */
+ eor v1.16b, v2.16b, v1.16b /* new state[0..3] = row0 ^ row2 */
+ orr v2.16b, v3.16b, v4.16b /* v2 = row1 ror 7 (row0 in v2 was consumed on the previous line) */
+ eor v0.16b, v2.16b, v0.16b /* new state[4..7] = row1 ^ row3 */
+ stp q1, q0, [x0] /* overwrite the caller's 32-byte state with the result */
+ ret
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16 /* 16-byte constants loaded by zfs_blake3_compress_xof_sse41 (same values as .LCPI0_0 - .LCPI0_4) */
+ .p2align 4
+.LCPI1_0: /* single-register tbl indices {2,3,0,1} per 32-bit lane: rotate each lane right by 16 bits */
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI1_1: /* 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A: BLAKE3 IV words 0-3 */
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI1_2: /* single-register tbl indices {1,2,3,0} per 32-bit lane: rotate each lane right by 8 bits */
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI1_3: /* two-register tbl { A, B }: result lanes = { A0, B1, A2, B3 } (indices 16-31 select from B) */
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 20
+ .byte 21
+ .byte 22
+ .byte 23
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+.LCPI1_4: /* two-register tbl { A, B }: result lanes = { A0, A1, A2, B3 } */
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 4
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+ .text /* zfs_blake3_compress_xof_sse41: one BLAKE3 compression producing 64 output bytes at x5; the state at x0 is only read */
+ .globl zfs_blake3_compress_xof_sse41 /* in: x0 = 8 x u32 state (read-only here), x1 = 64-byte message block, x5 = 64-byte output buffer */
+ .p2align 2 /* in: x3 = 64-bit counter; only the low byte of w2 and of w4 is used (block_len and flags in upstream BLAKE3 naming - confirm against the C prototype) */
+ .type zfs_blake3_compress_xof_sse41,@function /* working state is four 4-lane rows: row0 = state[0..3], row1 = state[4..7], row2 = .LCPI1_1, row3 = { x3 low, x3 high, w2 & 0xff, w4 & 0xff } */
+zfs_blake3_compress_xof_sse41: /* leaf routine: no stack use; writes only x8-x13, v0-v7, v16-v29 and the 64 bytes at x5 */
+ .cfi_startproc /* 7 rounds: 14 rotate-by-16 shuffles (tbl ..., v0.16b) below, one per column step and one per diagonal step */
+ ldp q7, q6, [x0] /* v7 = row0 = state[0..3], v6 = row1 = state[4..7] */
+ ldp q17, q18, [x1] /* message words 0-3 and 4-7 */
+ add x12, x1, #32
+ ld2 { v4.4s, v5.4s }, [x12] /* message bytes 32..63 de-interleaved: v4 = words 8,10,12,14; v5 = words 9,11,13,15 */
+ lsr x10, x3, #32 /* w10 = high 32 bits of the counter */
+ fmov s16, w3 /* v16 lane 0 = low 32 bits of the counter */
+ adrp x13, .LCPI1_0
+ adrp x11, .LCPI1_1
+ and w8, w2, #0xff
+ mov v16.s[1], w10 /* v16 lane 1 = counter high */
+ ldr q0, [x13, :lo12:.LCPI1_0] /* v0 = rotate-right-16 byte shuffle, kept for the whole function */
+ ldr q20, [x11, :lo12:.LCPI1_1] /* v20 = row2 initial value */
+ adrp x11, .LCPI1_4
+ and w9, w4, #0xff
+ ldr q2, [x11, :lo12:.LCPI1_4] /* v2 = two-register blend table { A0, A1, A2, B3 } */
+ mov v16.s[2], w8 /* v16 lane 2 = w2 & 0xff */
+ uzp1 v21.4s, v17.4s, v18.4s /* message words 0,2,4,6 */
+ add v7.4s, v6.4s, v7.4s /* row0 += row1 */
+ adrp x12, .LCPI1_3
+ mov v16.s[3], w9 /* v16 lane 3 = w4 & 0xff; v16 is now row3 */
+ uzp2 v18.4s, v17.4s, v18.4s /* message words 1,3,5,7 */
+ add v7.4s, v7.4s, v21.4s /* row0 += message words 0,2,4,6 */
+ ext v17.16b, v5.16b, v5.16b, #12
+ ldr q3, [x12, :lo12:.LCPI1_3] /* v3 = two-register blend table { A0, B1, A2, B3 } */
+ ext v24.16b, v4.16b, v4.16b, #12
+ eor v16.16b, v7.16b, v16.16b /* row3 ^= row0 */
+ mov v27.16b, v17.16b
+ uzp1 v19.4s, v21.4s, v21.4s
+ ext v25.16b, v21.16b, v21.16b, #12
+ zip2 v28.4s, v18.4s, v17.4s
+ tbl v29.16b, { v16.16b }, v0.16b /* row3 = row3 ror 16 */
+ mov v27.s[1], v24.s[2]
+ zip1 v23.2d, v17.2d, v18.2d
+ ext v19.16b, v19.16b, v21.16b, #8
+ add v22.4s, v29.4s, v20.4s /* row2 += row3 */
+ ext v26.16b, v21.16b, v25.16b, #12
+ tbl v20.16b, { v23.16b, v24.16b }, v2.16b
+ zip1 v21.4s, v28.4s, v24.4s
+ zip1 v23.4s, v24.4s, v28.4s
+ uzp2 v19.4s, v19.4s, v18.4s
+ eor v24.16b, v22.16b, v6.16b /* row1 ^= row2 */
+ ext v25.16b, v20.16b, v20.16b, #12
+ ext v6.16b, v23.16b, v21.16b, #8
+ add v7.4s, v7.4s, v18.4s /* row0 += message words 1,3,5,7 */
+ ext v18.16b, v19.16b, v19.16b, #4
+ tbl v16.16b, { v26.16b, v27.16b }, v3.16b
+ uzp1 v21.4s, v20.4s, v25.4s
+ mov v26.16b, v6.16b
+ ext v23.16b, v18.16b, v18.16b, #12
+ mov v26.s[1], v21.s[2]
+ adrp x10, .LCPI1_2
+ ext v25.16b, v18.16b, v23.16b, #12
+ uzp1 v23.4s, v18.4s, v18.4s
+ ldr q1, [x10, :lo12:.LCPI1_2] /* v1 = rotate-right-8 byte shuffle, kept for the whole function */
+ ext v18.16b, v23.16b, v18.16b, #8
+ ushr v23.4s, v24.4s, #12 /* ushr 12 + shl 20 + orr: row1 = row1 ror 12 */
+ shl v24.4s, v24.4s, #20
+ orr v23.16b, v24.16b, v23.16b
+ add v7.4s, v7.4s, v23.4s /* row0 += row1 */
+ eor v27.16b, v29.16b, v7.16b /* row3 ^= row0 */
+ add v4.4s, v7.4s, v4.4s
+ tbl v7.16b, { v25.16b, v26.16b }, v3.16b
+ tbl v26.16b, { v27.16b }, v1.16b /* row3 = row3 ror 8 */
+ add v22.4s, v22.4s, v26.4s /* row2 += row3 */
+ uzp2 v18.4s, v18.4s, v16.4s
+ eor v23.16b, v23.16b, v22.16b /* row1 ^= row2 */
+ ext v5.16b, v18.16b, v18.16b, #4
+ ushr v27.4s, v23.4s, #7 /* ushr 7 + shl 25 + orr: row1 = row1 ror 7 */
+ shl v23.4s, v23.4s, #25
+ uzp1 v25.4s, v5.4s, v5.4s
+ orr v23.16b, v23.16b, v27.16b
+ ext v28.16b, v4.16b, v4.16b, #12 /* lane-rotate row0 (message already added) for the diagonal step */
+ ext v4.16b, v25.16b, v5.16b, #8
+ ext v25.16b, v26.16b, v26.16b, #8 /* lane-rotate row3 by two lanes for the diagonal step */
+ add v26.4s, v28.4s, v23.4s
+ eor v25.16b, v26.16b, v25.16b
+ ext v22.16b, v22.16b, v22.16b, #4 /* lane-rotate row2 by one lane for the diagonal step */
+ tbl v25.16b, { v25.16b }, v0.16b /* from here on the same add / eor / rotate pattern repeats, interleaved with message-word permutation (zip, uzp, ext, two-register tbl via v2 / v3) */
+ add v22.4s, v22.4s, v25.4s
+ eor v23.16b, v23.16b, v22.16b
+ add v17.4s, v26.4s, v17.4s
+ ushr v26.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v23.16b, v23.16b, v26.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v25.16b, v25.16b, v17.16b
+ add v17.4s, v17.4s, v19.4s
+ tbl v19.16b, { v25.16b }, v1.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ ext v17.16b, v17.16b, v17.16b, #4
+ orr v23.16b, v23.16b, v25.16b
+ ext v19.16b, v19.16b, v19.16b, #8
+ add v17.4s, v17.4s, v23.4s
+ eor v19.16b, v17.16b, v19.16b
+ ext v22.16b, v22.16b, v22.16b, #12
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v22.4s, v22.4s, v19.4s
+ eor v23.16b, v23.16b, v22.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v17.4s, v17.4s, v16.4s
+ orr v23.16b, v23.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ ext v25.16b, v17.16b, v17.16b, #12
+ eor v17.16b, v19.16b, v17.16b
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v19.4s, v22.4s, v17.4s
+ eor v22.16b, v23.16b, v19.16b
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v6.2d, v16.2d
+ ushr v23.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ zip2 v24.4s, v16.4s, v6.4s
+ tbl v26.16b, { v20.16b, v21.16b }, v2.16b
+ orr v22.16b, v22.16b, v23.16b
+ zip1 v16.4s, v24.4s, v21.4s
+ zip1 v20.4s, v21.4s, v24.4s
+ ext v21.16b, v26.16b, v26.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #8
+ add v25.4s, v25.4s, v22.4s
+ ext v16.16b, v20.16b, v16.16b, #8
+ uzp1 v21.4s, v26.4s, v21.4s
+ eor v26.16b, v25.16b, v17.16b
+ ext v19.16b, v19.16b, v19.16b, #4
+ tbl v26.16b, { v26.16b }, v0.16b
+ mov v29.16b, v16.16b
+ add v19.4s, v19.4s, v26.4s
+ ext v27.16b, v5.16b, v5.16b, #12
+ mov v29.s[1], v21.s[2]
+ eor v22.16b, v22.16b, v19.16b
+ ext v28.16b, v5.16b, v27.16b, #12
+ ushr v27.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v25.4s, v6.4s
+ orr v22.16b, v22.16b, v27.16b
+ add v6.4s, v6.4s, v22.4s
+ eor v26.16b, v26.16b, v6.16b
+ add v6.4s, v6.4s, v18.4s
+ tbl v18.16b, { v26.16b }, v1.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #7
+ shl v22.4s, v22.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v22.16b, v22.16b, v26.16b
+ ext v18.16b, v18.16b, v18.16b, #8
+ add v6.4s, v6.4s, v22.4s
+ eor v18.16b, v6.16b, v18.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ tbl v18.16b, { v18.16b }, v0.16b
+ add v19.4s, v19.4s, v18.4s
+ eor v22.16b, v22.16b, v19.16b
+ ushr v26.4s, v22.4s, #12
+ shl v22.4s, v22.4s, #20
+ add v6.4s, v6.4s, v7.4s
+ orr v22.16b, v22.16b, v26.16b
+ add v6.4s, v6.4s, v22.4s
+ ext v26.16b, v6.16b, v6.16b, #12
+ eor v6.16b, v18.16b, v6.16b
+ uzp2 v4.4s, v4.4s, v7.4s
+ zip2 v25.4s, v7.4s, v16.4s
+ add v26.4s, v26.4s, v21.4s
+ zip1 v20.2d, v16.2d, v7.2d
+ tbl v6.16b, { v6.16b }, v1.16b
+ ext v24.16b, v4.16b, v4.16b, #4
+ tbl v27.16b, { v20.16b, v21.16b }, v2.16b
+ zip1 v7.4s, v25.4s, v21.4s
+ zip1 v20.4s, v21.4s, v25.4s
+ add v18.4s, v19.4s, v6.4s
+ uzp1 v5.4s, v24.4s, v24.4s
+ ext v21.16b, v27.16b, v27.16b, #12
+ ext v7.16b, v20.16b, v7.16b, #8
+ eor v19.16b, v22.16b, v18.16b
+ ext v5.16b, v5.16b, v24.16b, #8
+ tbl v17.16b, { v28.16b, v29.16b }, v3.16b
+ uzp1 v21.4s, v27.4s, v21.4s
+ mov v28.16b, v7.16b
+ ushr v22.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v23.16b, v24.16b, v24.16b, #12
+ uzp2 v5.4s, v5.4s, v17.4s
+ mov v28.s[1], v21.s[2]
+ orr v19.16b, v19.16b, v22.16b
+ ext v27.16b, v24.16b, v23.16b, #12
+ ext v23.16b, v5.16b, v5.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #8
+ ext v25.16b, v18.16b, v18.16b, #4
+ add v18.4s, v26.4s, v19.4s
+ uzp1 v24.4s, v23.4s, v23.4s
+ eor v6.16b, v18.16b, v6.16b
+ ext v24.16b, v24.16b, v23.16b, #8
+ add v16.4s, v18.4s, v16.4s
+ tbl v18.16b, { v27.16b, v28.16b }, v3.16b
+ tbl v27.16b, { v6.16b }, v0.16b
+ uzp2 v6.4s, v24.4s, v18.4s
+ add v24.4s, v25.4s, v27.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v25.16b
+ add v16.4s, v16.4s, v19.4s
+ eor v25.16b, v27.16b, v16.16b
+ add v4.4s, v16.4s, v4.4s
+ tbl v16.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v16.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v16.16b, v16.16b, v16.16b, #8
+ add v4.4s, v4.4s, v19.4s
+ eor v16.16b, v4.16b, v16.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v25.16b, { v16.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v16.16b, v19.16b, v24.16b
+ ushr v19.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v17.4s
+ orr v19.16b, v16.16b, v19.16b
+ add v27.4s, v4.4s, v19.4s
+ eor v25.16b, v25.16b, v27.16b
+ tbl v25.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v25.4s
+ zip2 v26.4s, v17.4s, v7.4s
+ ext v4.16b, v27.16b, v27.16b, #12
+ eor v19.16b, v19.16b, v24.16b
+ add v28.4s, v4.4s, v21.4s
+ zip1 v20.2d, v7.2d, v17.2d
+ zip1 v4.4s, v26.4s, v21.4s
+ zip1 v17.4s, v21.4s, v26.4s
+ ushr v26.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v19.16b, v19.16b, v26.16b
+ ext v25.16b, v25.16b, v25.16b, #8
+ add v27.4s, v28.4s, v19.4s
+ eor v25.16b, v27.16b, v25.16b
+ ext v24.16b, v24.16b, v24.16b, #4
+ tbl v25.16b, { v25.16b }, v0.16b
+ add v24.4s, v24.4s, v25.4s
+ eor v19.16b, v19.16b, v24.16b
+ add v7.4s, v27.4s, v7.4s
+ ushr v27.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v19.16b, v19.16b, v27.16b
+ add v7.4s, v7.4s, v19.4s
+ eor v25.16b, v25.16b, v7.16b
+ add v5.4s, v7.4s, v5.4s
+ tbl v7.16b, { v25.16b }, v1.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v19.16b, v19.16b, v25.16b
+ ext v7.16b, v7.16b, v7.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ eor v7.16b, v5.16b, v7.16b
+ ext v24.16b, v24.16b, v24.16b, #12
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v24.4s, v24.4s, v7.4s
+ eor v19.16b, v19.16b, v24.16b
+ ushr v25.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ tbl v16.16b, { v20.16b, v21.16b }, v2.16b
+ add v5.4s, v5.4s, v18.4s
+ orr v19.16b, v19.16b, v25.16b
+ ext v20.16b, v16.16b, v16.16b, #12
+ ext v4.16b, v17.16b, v4.16b, #8
+ add v5.4s, v5.4s, v19.4s
+ uzp1 v21.4s, v16.4s, v20.4s
+ mov v17.16b, v4.16b
+ ext v25.16b, v5.16b, v5.16b, #12
+ mov v17.s[1], v21.s[2]
+ add v25.4s, v25.4s, v21.4s
+ zip1 v20.2d, v4.2d, v18.2d
+ ext v22.16b, v23.16b, v23.16b, #12
+ zip2 v26.4s, v18.4s, v4.4s
+ tbl v18.16b, { v20.16b, v21.16b }, v2.16b
+ eor v5.16b, v7.16b, v5.16b
+ ext v16.16b, v23.16b, v22.16b, #12
+ ext v22.16b, v6.16b, v6.16b, #4
+ zip1 v27.4s, v26.4s, v21.4s
+ zip1 v20.4s, v21.4s, v26.4s
+ ext v21.16b, v18.16b, v18.16b, #12
+ tbl v5.16b, { v5.16b }, v1.16b
+ ext v20.16b, v20.16b, v27.16b, #8
+ uzp1 v27.4s, v18.4s, v21.4s
+ uzp1 v18.4s, v22.4s, v22.4s
+ add v21.4s, v24.4s, v5.4s
+ ext v18.16b, v18.16b, v22.16b, #8
+ eor v19.16b, v19.16b, v21.16b
+ tbl v7.16b, { v16.16b, v17.16b }, v3.16b
+ uzp2 v18.4s, v18.4s, v17.4s
+ zip2 v16.4s, v16.4s, v20.4s
+ ushr v17.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ orr v17.16b, v19.16b, v17.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v19.4s, v25.4s, v17.4s
+ eor v5.16b, v19.16b, v5.16b
+ ext v21.16b, v21.16b, v21.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v4.4s, v19.4s, v4.4s
+ add v19.4s, v21.4s, v5.4s
+ eor v17.16b, v17.16b, v19.16b
+ ushr v21.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ orr v17.16b, v17.16b, v21.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v4.4s, v4.4s, v6.4s
+ add v6.4s, v19.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v17.16b, v17.16b, v19.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v17.16b, v17.16b, v6.16b
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v4.4s, v4.4s, v7.4s
+ orr v17.16b, v17.16b, v19.16b
+ add v4.4s, v4.4s, v17.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ mov v29.16b, v20.16b
+ ext v4.16b, v4.16b, v4.16b, #12
+ add v6.4s, v6.4s, v5.4s
+ mov v29.s[1], v27.s[2]
+ add v4.4s, v4.4s, v27.4s
+ zip1 v26.2d, v20.2d, v7.2d
+ zip1 v7.4s, v16.4s, v27.4s
+ zip1 v16.4s, v27.4s, v16.4s
+ eor v17.16b, v17.16b, v6.16b
+ ext v7.16b, v16.16b, v7.16b, #8
+ ushr v16.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ orr v16.16b, v17.16b, v16.16b
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #4
+ tbl v5.16b, { v5.16b }, v0.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ ushr v17.4s, v16.4s, #12
+ shl v16.4s, v16.4s, #20
+ add v4.4s, v4.4s, v20.4s
+ orr v16.16b, v16.16b, v17.16b
+ add v4.4s, v4.4s, v16.4s
+ eor v5.16b, v5.16b, v4.16b
+ tbl v5.16b, { v5.16b }, v1.16b
+ add v6.4s, v6.4s, v5.4s
+ eor v16.16b, v16.16b, v6.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v17.4s, v16.4s, #7
+ shl v16.4s, v16.4s, #25
+ ext v23.16b, v22.16b, v22.16b, #12
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v16.16b, v16.16b, v17.16b
+ ext v28.16b, v22.16b, v23.16b, #12
+ ext v5.16b, v5.16b, v5.16b, #8
+ add v4.4s, v16.4s, v4.4s
+ tbl v3.16b, { v28.16b, v29.16b }, v3.16b /* last use of the blend table in v3; the register is reused for data from here */
+ eor v5.16b, v4.16b, v5.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ add v3.4s, v4.4s, v3.4s
+ tbl v4.16b, { v5.16b }, v0.16b
+ add v5.4s, v6.4s, v4.4s
+ eor v6.16b, v16.16b, v5.16b
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ orr v6.16b, v6.16b, v16.16b
+ tbl v2.16b, { v26.16b, v27.16b }, v2.16b /* last use of the blend table in v2; the register is reused for data from here */
+ add v3.4s, v3.4s, v6.4s
+ ext v19.16b, v2.16b, v2.16b, #12
+ eor v4.16b, v4.16b, v3.16b
+ uzp1 v2.4s, v2.4s, v19.4s
+ ext v3.16b, v3.16b, v3.16b, #12
+ tbl v4.16b, { v4.16b }, v1.16b
+ add v2.4s, v3.4s, v2.4s
+ add v3.4s, v5.4s, v4.4s
+ eor v5.16b, v6.16b, v3.16b
+ ushr v6.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v6.16b
+ ext v4.16b, v4.16b, v4.16b, #8
+ add v2.4s, v2.4s, v5.4s
+ eor v4.16b, v2.16b, v4.16b
+ ext v3.16b, v3.16b, v3.16b, #4
+ tbl v0.16b, { v4.16b }, v0.16b /* last rotate-by-16; v0 now holds row3 instead of the shuffle table */
+ add v3.4s, v3.4s, v0.4s
+ eor v4.16b, v5.16b, v3.16b
+ ushr v5.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v2.4s, v2.4s, v7.4s
+ orr v4.16b, v4.16b, v5.16b
+ add v2.4s, v2.4s, v4.4s
+ eor v0.16b, v0.16b, v2.16b
+ tbl v0.16b, { v0.16b }, v1.16b /* last rotate-by-8; v1 is free for data afterwards */
+ add v1.4s, v3.4s, v0.4s /* v1 = row2 */
+ eor v3.16b, v4.16b, v1.16b /* row1 ^= row2, rotated by 7 just below */
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4 /* undo the diagonal lane rotation of row0 */
+ ext v0.16b, v0.16b, v0.16b, #8 /* undo the diagonal lane rotation of row3 */
+ ext v1.16b, v1.16b, v1.16b, #12 /* undo the diagonal lane rotation of row2 */
+ orr v3.16b, v3.16b, v4.16b /* v3 = row1 ror 7 */
+ eor v2.16b, v2.16b, v1.16b /* out[0..15] = row0 ^ row2 */
+ eor v3.16b, v3.16b, v0.16b /* out[16..31] = row1 ^ row3 */
+ stp q2, q3, [x5]
+ ldr q2, [x0] /* state[0..3] is re-read here, after the store above - the result differs if the buffer at x5 overlaps x0 */
+ eor v1.16b, v2.16b, v1.16b /* out[32..47] = state[0..3] ^ row2 */
+ str q1, [x5, #32]
+ ldr q1, [x0, #16] /* state[4..7], likewise re-read after the earlier stores */
+ eor v0.16b, v1.16b, v0.16b /* out[48..63] = state[4..7] ^ row3 */
+ str q0, [x5, #48]
+ ret
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16 /* 16-byte constants referenced by zfs_blake3_hash_many_sse41 */
+ .p2align 4
+.LCPI2_0: /* lane numbers { 0, 1, 2, 3 }; zfs_blake3_hash_many_sse41 ANDs them with a mask built from bit 0 of w5 before use */
+ .word 0
+ .word 1
+ .word 2
+ .word 3
+.LCPI2_1: /* single-register tbl indices {2,3,0,1} per 32-bit lane: rotate each lane right by 16 bits */
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI2_2: /* single-register tbl indices {1,2,3,0} per 32-bit lane: rotate each lane right by 8 bits */
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+ .text
+ .globl zfs_blake3_hash_many_sse41
+ .p2align 2
+ .type zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41:
+ .cfi_startproc
+ stp d15, d14, [sp, #-160]!
+ stp d13, d12, [sp, #16]
+ stp d11, d10, [sp, #32]
+ stp d9, d8, [sp, #48]
+ stp x29, x30, [sp, #64]
+ stp x28, x27, [sp, #80]
+ stp x26, x25, [sp, #96]
+ stp x24, x23, [sp, #112]
+ stp x22, x21, [sp, #128]
+ stp x20, x19, [sp, #144]
+ mov x29, sp
+ sub sp, sp, #448
+ .cfi_def_cfa w29, 160
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w21, -24
+ .cfi_offset w22, -32
+ .cfi_offset w23, -40
+ .cfi_offset w24, -48
+ .cfi_offset w25, -56
+ .cfi_offset w26, -64
+ .cfi_offset w27, -72
+ .cfi_offset w28, -80
+ .cfi_offset w30, -88
+ .cfi_offset w29, -96
+ .cfi_offset b8, -104
+ .cfi_offset b9, -112
+ .cfi_offset b10, -120
+ .cfi_offset b11, -128
+ .cfi_offset b12, -136
+ .cfi_offset b13, -144
+ .cfi_offset b14, -152
+ .cfi_offset b15, -160
+ ldr x26, [x29, #168]
+ ldrb w27, [x29, #160]
+ mov w19, w6
+ mov x20, x4
+ mov x22, x2
+ mov x28, x1
+ cmp x1, #4
+ mov x24, x0
+ str x3, [sp, #40]
+ b.lo .LBB2_8
+ adrp x11, .LCPI2_0
+ ldr q0, [x11, :lo12:.LCPI2_0]
+ sbfx w13, w5, #0, #1
+ dup v1.4s, w13
+ mov w10, #58983
+ mov w11, #44677
+ mov w12, #62322
+ and v0.16b, v1.16b, v0.16b
+ mov w13, #62778
+ orr w8, w7, w19
+ adrp x9, .LCPI2_1
+ movk w10, #27145, lsl #16
+ movk w11, #47975, lsl #16
+ movk w12, #15470, lsl #16
+ movk w13, #42319, lsl #16
+ str q0, [sp, #16]
+ orr v0.4s, #128, lsl #24
+ adrp x14, .LCPI2_2
+ str q0, [sp]
+.LBB2_2:
+ ldr x2, [sp, #40]
+ mov x15, x2
+ ld1r { v7.4s }, [x15], #4
+ add x16, x2, #8
+ add x17, x2, #12
+ add x18, x2, #16
+ add x0, x2, #20
+ add x3, x2, #24
+ add x2, x2, #28
+ ld1r { v6.4s }, [x16]
+ ld1r { v17.4s }, [x17]
+ ld1r { v10.4s }, [x18]
+ ld1r { v11.4s }, [x0]
+ ld1r { v19.4s }, [x3]
+ ld1r { v18.4s }, [x15]
+ ld1r { v16.4s }, [x2]
+ cbz x22, .LBB2_7
+ ldr q1, [sp, #16]
+ dup v0.4s, w20
+ ldp x15, x16, [x24]
+ ldp x17, x18, [x24, #16]
+ add v1.4s, v0.4s, v1.4s
+ movi v0.4s, #128, lsl #24
+ str q1, [sp, #64]
+ eor v0.16b, v1.16b, v0.16b
+ ldr q1, [sp]
+ lsr x2, x20, #32
+ mov x0, xzr
+ mov w6, w8
+ cmgt v0.4s, v1.4s, v0.4s
+ dup v1.4s, w2
+ sub v0.4s, v1.4s, v0.4s
+ str q0, [sp, #48]
+.LBB2_4:
+ mov w4, #16
+ stp q16, q17, [sp, #192]
+ bfi x4, x0, #6, #58
+ ldr q1, [x15, x4]
+ ldr q3, [x16, x4]
+ ldr q2, [x17, x4]
+ ldr q4, [x18, x4]
+ mov w4, #32
+ bfi x4, x0, #6, #58
+ ldr q5, [x15, x4]
+ ldr q20, [x16, x4]
+ ldr q21, [x17, x4]
+ ldr q22, [x18, x4]
+ mov w4, #48
+ lsl x3, x0, #6
+ bfi x4, x0, #6, #58
+ add x0, x0, #1
+ ldr q0, [x15, x3]
+ ldr q23, [x16, x3]
+ ldr q16, [x17, x3]
+ ldr q17, [x18, x3]
+ cmp x0, x22
+ ldr q25, [x15, x4]
+ ldr q14, [x16, x4]
+ ldr q28, [x17, x4]
+ ldr q31, [x18, x4]
+ csel w4, w27, wzr, eq
+ orr w4, w4, w6
+ mov x2, xzr
+ and w6, w4, #0xff
+ add x3, x3, #256
+.LBB2_5:
+ ldr x4, [x24, x2]
+ add x2, x2, #8
+ cmp x2, #32
+ add x4, x4, x3
+ prfm pldl1keep, [x4]
+ b.ne .LBB2_5
+ zip1 v29.4s, v0.4s, v23.4s
+ zip2 v23.4s, v0.4s, v23.4s
+ zip1 v0.4s, v16.4s, v17.4s
+ zip2 v24.4s, v16.4s, v17.4s
+ zip1 v9.4s, v1.4s, v3.4s
+ zip2 v26.4s, v1.4s, v3.4s
+ zip1 v27.4s, v2.4s, v4.4s
+ zip2 v17.4s, v2.4s, v4.4s
+ zip1 v12.4s, v21.4s, v22.4s
+ zip2 v13.4s, v21.4s, v22.4s
+ add v2.4s, v7.4s, v10.4s
+ add v1.4s, v18.4s, v11.4s
+ ext v7.16b, v0.16b, v29.16b, #8
+ ext v22.16b, v24.16b, v23.16b, #8
+ zip1 v30.4s, v5.4s, v20.4s
+ zip2 v20.4s, v5.4s, v20.4s
+ stp q1, q2, [sp, #112]
+ ext v2.16b, v29.16b, v7.16b, #8
+ mov v29.d[1], v0.d[0]
+ ext v18.16b, v23.16b, v22.16b, #8
+ mov v23.d[1], v24.d[0]
+ zip1 v21.4s, v25.4s, v14.4s
+ zip2 v4.4s, v25.4s, v14.4s
+ zip1 v14.4s, v28.4s, v31.4s
+ zip2 v15.4s, v28.4s, v31.4s
+ add v8.4s, v6.4s, v19.4s
+ ext v28.16b, v27.16b, v9.16b, #8
+ ext v31.16b, v17.16b, v26.16b, #8
+ stur q2, [x29, #-208]
+ mov v7.16b, v29.16b
+ ext v0.16b, v12.16b, v30.16b, #8
+ stp q23, q29, [x29, #-80]
+ mov v2.16b, v19.16b
+ ext v19.16b, v13.16b, v20.16b, #8
+ mov v29.16b, v9.16b
+ ext v25.16b, v9.16b, v28.16b, #8
+ mov v29.d[1], v27.d[0]
+ ext v24.16b, v26.16b, v31.16b, #8
+ mov v26.d[1], v17.d[0]
+ ext v17.16b, v15.16b, v4.16b, #8
+ ext v27.16b, v30.16b, v0.16b, #8
+ ext v0.16b, v20.16b, v19.16b, #8
+ stp q0, q25, [sp, #80]
+ ext v0.16b, v4.16b, v17.16b, #8
+ str q0, [sp, #224]
+ ldr q0, [sp, #128]
+ mov v6.16b, v23.16b
+ mov v22.16b, v4.16b
+ ldr q16, [x9, :lo12:.LCPI2_1]
+ add v17.4s, v0.4s, v7.4s
+ ldr q0, [sp, #112]
+ mov v30.d[1], v12.d[0]
+ add v7.4s, v8.4s, v29.4s
+ mov v20.d[1], v13.d[0]
+ add v4.4s, v0.4s, v6.4s
+ ldr q0, [sp, #64]
+ dup v3.4s, w12
+ ext v28.16b, v14.16b, v21.16b, #8
+ dup v1.4s, w10
+ eor v19.16b, v17.16b, v0.16b
+ ldr q0, [sp, #48]
+ ext v23.16b, v21.16b, v28.16b, #8
+ mov v21.d[1], v14.d[0]
+ tbl v14.16b, { v19.16b }, v16.16b
+ eor v12.16b, v4.16b, v0.16b
+ movi v0.4s, #64
+ eor v13.16b, v7.16b, v0.16b
+ tbl v13.16b, { v13.16b }, v16.16b
+ add v6.4s, v13.4s, v3.4s
+ dup v5.4s, w11
+ tbl v12.16b, { v12.16b }, v16.16b
+ add v1.4s, v14.4s, v1.4s
+ eor v9.16b, v6.16b, v2.16b
+ ldp q2, q0, [sp, #192]
+ add v5.4s, v12.4s, v5.4s
+ eor v19.16b, v1.16b, v10.16b
+ eor v10.16b, v5.16b, v11.16b
+ ushr v11.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v11.16b, v19.16b, v11.16b
+ ushr v19.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ mov v22.d[1], v15.d[0]
+ orr v10.16b, v10.16b, v19.16b
+ ushr v19.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ add v15.4s, v0.4s, v2.4s
+ orr v9.16b, v9.16b, v19.16b
+ dup v19.4s, w6
+ add v15.4s, v15.4s, v26.4s
+ eor v19.16b, v15.16b, v19.16b
+ tbl v3.16b, { v19.16b }, v16.16b
+ dup v19.4s, w13
+ add v8.4s, v3.4s, v19.4s
+ ldur q31, [x29, #-208]
+ eor v19.16b, v8.16b, v2.16b
+ ushr v0.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v2.16b, v19.16b, v0.16b
+ ldr q19, [x14, :lo12:.LCPI2_2]
+ add v17.4s, v17.4s, v31.4s
+ add v17.4s, v17.4s, v11.4s
+ eor v14.16b, v14.16b, v17.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ add v1.4s, v1.4s, v14.4s
+ eor v11.16b, v1.16b, v11.16b
+ add v4.4s, v4.4s, v18.4s
+ ushr v0.4s, v11.4s, #7
+ shl v11.4s, v11.4s, #25
+ add v4.4s, v4.4s, v10.4s
+ orr v0.16b, v11.16b, v0.16b
+ eor v11.16b, v12.16b, v4.16b
+ tbl v11.16b, { v11.16b }, v19.16b
+ add v5.4s, v5.4s, v11.4s
+ eor v10.16b, v5.16b, v10.16b
+ add v7.4s, v7.4s, v25.4s
+ ushr v12.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ add v7.4s, v7.4s, v9.4s
+ orr v10.16b, v10.16b, v12.16b
+ eor v12.16b, v13.16b, v7.16b
+ tbl v12.16b, { v12.16b }, v19.16b
+ add v6.4s, v6.4s, v12.4s
+ eor v9.16b, v6.16b, v9.16b
+ ushr v13.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ orr v9.16b, v9.16b, v13.16b
+ add v13.4s, v15.4s, v24.4s
+ add v13.4s, v13.4s, v2.4s
+ eor v3.16b, v3.16b, v13.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v8.4s, v8.4s, v3.4s
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v30.4s
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v21.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v22.4s
+ mov v28.16b, v26.16b
+ stur q26, [x29, #-112]
+ mov v26.16b, v18.16b
+ mov v18.16b, v24.16b
+ stur q24, [x29, #-160]
+ add v6.4s, v6.4s, v3.4s
+ mov v24.16b, v20.16b
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q20, [sp, #80]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ stp q30, q22, [x29, #-192]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ mov v30.16b, v27.16b
+ add v17.4s, v17.4s, v27.4s
+ ldr q27, [sp, #224]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v23.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v27.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ stur q21, [x29, #-144]
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ ldur q21, [x29, #-80]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v21.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v26.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v18.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v29.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-64]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v28.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v23.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-144]
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v31.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v27.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q27, [sp, #96]
+ mov v21.16b, v26.16b
+ stur q26, [x29, #-96]
+ mov v28.16b, v31.16b
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldp q31, q26, [x29, #-192]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v20.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v27.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v26.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v31.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ mov v18.16b, v24.16b
+ mov v24.16b, v20.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ldur q20, [x29, #-160]
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v21.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v18.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v23.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v20.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q25, [x29, #-80]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v29.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v25.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v26.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ ldur q25, [x29, #-112]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v30.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v24.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v31.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q25, [x29, #-64]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldr q31, [sp, #224]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v27.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v25.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v31.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v28.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v26.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v23.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ mov v21.16b, v29.16b
+ stur q29, [x29, #-128]
+ mov v29.16b, v30.16b
+ mov v30.16b, v27.16b
+ mov v27.16b, v18.16b
+ str q18, [sp, #176]
+ eor v0.16b, v0.16b, v1.16b
+ mov v18.16b, v22.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ldur q22, [x29, #-96]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v20.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v29.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v22.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v31.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v21.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v28.4s
+ add v6.4s, v6.4s, v3.4s
+ mov v22.16b, v24.16b
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q24, [x29, #-80]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ mov v21.16b, v30.16b
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldur q30, [x29, #-192]
+ mov v20.16b, v29.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ ldur q29, [x29, #-112]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v25.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v24.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v30.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v29.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v20.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v31.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v26.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ add v17.4s, v17.4s, v23.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v27.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v30.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ ldur q27, [x29, #-160]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v27.4s
+ mov v28.16b, v25.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v21.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v28.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v29.4s
+ mov v25.16b, v31.16b
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ ldur q31, [x29, #-96]
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v13.16b
+ ldur q28, [x29, #-208]
+ mov v18.16b, v20.16b
+ str q20, [sp, #144]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ ldur q20, [x29, #-128]
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v24.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v31.4s
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v28.4s
+ orr v0.16b, v0.16b, v15.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v13.4s, v13.4s, v20.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v0.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v13.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v1.16b, v2.16b
+ add v5.4s, v5.4s, v12.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v2.16b, v2.16b, v15.16b
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v15.16b
+ add v17.4s, v17.4s, v18.4s
+ add v17.4s, v17.4s, v0.4s
+ add v4.4s, v4.4s, v22.4s
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v30.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v25.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v16.16b
+ eor v3.16b, v3.16b, v13.16b
+ add v17.4s, v17.4s, v26.4s
+ mov v26.16b, v21.16b
+ add v4.4s, v4.4s, v21.4s
+ ldur q21, [x29, #-144]
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v8.16b, v2.16b
+ add v17.4s, v17.4s, v0.4s
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v14.16b, v14.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v7.4s, v7.4s, v21.4s
+ orr v2.16b, v2.16b, v15.16b
+ tbl v14.16b, { v14.16b }, v19.16b
+ eor v11.16b, v11.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ add v13.4s, v13.4s, v28.4s
+ add v1.4s, v1.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v19.16b
+ eor v12.16b, v12.16b, v7.16b
+ add v13.4s, v13.4s, v2.4s
+ str q23, [sp, #160]
+ eor v0.16b, v0.16b, v1.16b
+ add v5.4s, v5.4s, v11.4s
+ tbl v12.16b, { v12.16b }, v19.16b
+ eor v3.16b, v3.16b, v13.16b
+ add v17.4s, v17.4s, v23.4s
+ ldur q23, [x29, #-64]
+ ushr v15.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v10.16b, v5.16b, v10.16b
+ add v6.4s, v6.4s, v12.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ orr v0.16b, v0.16b, v15.16b
+ ushr v15.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ eor v9.16b, v6.16b, v9.16b
+ add v8.4s, v8.4s, v3.4s
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v2.16b, v8.16b, v2.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v23.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v24.4s
+ tbl v3.16b, { v3.16b }, v16.16b
+ eor v14.16b, v14.16b, v4.16b
+ add v7.4s, v7.4s, v2.4s
+ add v6.4s, v6.4s, v3.4s
+ tbl v14.16b, { v14.16b }, v16.16b
+ eor v11.16b, v11.16b, v7.16b
+ add v13.4s, v13.4s, v20.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v8.4s, v8.4s, v14.4s
+ tbl v11.16b, { v11.16b }, v16.16b
+ add v13.4s, v13.4s, v0.4s
+ ldr q20, [sp, #176]
+ ushr v15.4s, v10.4s, #12
+ shl v10.4s, v10.4s, #20
+ eor v9.16b, v8.16b, v9.16b
+ add v1.4s, v1.4s, v11.4s
+ eor v12.16b, v12.16b, v13.16b
+ orr v10.16b, v10.16b, v15.16b
+ ushr v15.4s, v9.4s, #12
+ shl v9.4s, v9.4s, #20
+ eor v2.16b, v1.16b, v2.16b
+ tbl v12.16b, { v12.16b }, v16.16b
+ orr v9.16b, v9.16b, v15.16b
+ ushr v15.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v5.4s, v5.4s, v12.4s
+ add v17.4s, v17.4s, v31.4s
+ orr v2.16b, v2.16b, v15.16b
+ eor v0.16b, v5.16b, v0.16b
+ add v17.4s, v17.4s, v10.4s
+ add v4.4s, v4.4s, v20.4s
+ add v7.4s, v7.4s, v29.4s
+ ushr v15.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v3.16b, v3.16b, v17.16b
+ add v4.4s, v4.4s, v9.4s
+ add v7.4s, v7.4s, v2.4s
+ orr v0.16b, v0.16b, v15.16b
+ mov v15.16b, v31.16b
+ add v17.4s, v17.4s, v22.4s
+ eor v31.16b, v14.16b, v4.16b
+ eor v22.16b, v11.16b, v7.16b
+ add v11.4s, v13.4s, v27.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v11.4s, v11.4s, v0.4s
+ tbl v31.16b, { v31.16b }, v19.16b
+ add v6.4s, v6.4s, v3.4s
+ eor v12.16b, v12.16b, v11.16b
+ tbl v22.16b, { v22.16b }, v19.16b
+ add v8.4s, v8.4s, v31.4s
+ eor v10.16b, v6.16b, v10.16b
+ add v30.4s, v11.4s, v30.4s
+ tbl v11.16b, { v12.16b }, v19.16b
+ add v1.4s, v1.4s, v22.4s
+ eor v9.16b, v8.16b, v9.16b
+ ushr v12.4s, v10.4s, #7
+ shl v10.4s, v10.4s, #25
+ add v5.4s, v5.4s, v11.4s
+ eor v2.16b, v1.16b, v2.16b
+ orr v10.16b, v10.16b, v12.16b
+ ushr v12.4s, v9.4s, #7
+ shl v9.4s, v9.4s, #25
+ eor v0.16b, v5.16b, v0.16b
+ orr v9.16b, v9.16b, v12.16b
+ ushr v12.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v2.16b, v2.16b, v12.16b
+ ushr v12.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v12.16b
+ add v4.4s, v4.4s, v26.4s
+ add v17.4s, v17.4s, v0.4s
+ add v7.4s, v7.4s, v28.4s
+ mov v18.16b, v27.16b
+ eor v31.16b, v31.16b, v17.16b
+ add v4.4s, v4.4s, v10.4s
+ add v27.4s, v30.4s, v2.4s
+ eor v22.16b, v22.16b, v4.16b
+ add v7.4s, v7.4s, v9.4s
+ eor v3.16b, v3.16b, v27.16b
+ add v26.4s, v27.4s, v29.4s
+ tbl v27.16b, { v31.16b }, v16.16b
+ eor v28.16b, v11.16b, v7.16b
+ tbl v22.16b, { v22.16b }, v16.16b
+ add v1.4s, v1.4s, v27.4s
+ add v4.4s, v4.4s, v23.4s
+ ldr q23, [sp, #144]
+ tbl v28.16b, { v28.16b }, v16.16b
+ tbl v3.16b, { v3.16b }, v16.16b
+ add v5.4s, v5.4s, v22.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v6.4s, v6.4s, v28.4s
+ add v29.4s, v8.4s, v3.4s
+ eor v30.16b, v5.16b, v10.16b
+ ushr v8.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v31.16b, v6.16b, v9.16b
+ orr v0.16b, v0.16b, v8.16b
+ ushr v8.4s, v30.4s, #12
+ shl v30.4s, v30.4s, #20
+ eor v2.16b, v29.16b, v2.16b
+ orr v30.16b, v30.16b, v8.16b
+ ushr v8.4s, v31.4s, #12
+ shl v31.4s, v31.4s, #20
+ add v17.4s, v17.4s, v25.4s
+ add v7.4s, v7.4s, v23.4s
+ orr v31.16b, v31.16b, v8.16b
+ ushr v8.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ldur q23, [x29, #-176]
+ orr v2.16b, v2.16b, v8.16b
+ add v17.4s, v17.4s, v0.4s
+ eor v27.16b, v27.16b, v17.16b
+ add v4.4s, v4.4s, v30.4s
+ add v25.4s, v26.4s, v2.4s
+ eor v22.16b, v22.16b, v4.16b
+ add v4.4s, v4.4s, v24.4s
+ add v7.4s, v7.4s, v31.4s
+ eor v3.16b, v3.16b, v25.16b
+ add v24.4s, v25.4s, v18.4s
+ tbl v25.16b, { v27.16b }, v19.16b
+ add v17.4s, v17.4s, v23.4s
+ eor v23.16b, v28.16b, v7.16b
+ tbl v22.16b, { v22.16b }, v19.16b
+ add v1.4s, v1.4s, v25.4s
+ tbl v23.16b, { v23.16b }, v19.16b
+ tbl v3.16b, { v3.16b }, v19.16b
+ add v5.4s, v5.4s, v22.4s
+ eor v0.16b, v0.16b, v1.16b
+ add v6.4s, v6.4s, v23.4s
+ add v26.4s, v29.4s, v3.4s
+ eor v27.16b, v5.16b, v30.16b
+ ushr v29.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v28.16b, v6.16b, v31.16b
+ orr v0.16b, v0.16b, v29.16b
+ ushr v29.4s, v27.4s, #7
+ shl v27.4s, v27.4s, #25
+ eor v2.16b, v26.16b, v2.16b
+ orr v27.16b, v27.16b, v29.16b
+ ushr v29.4s, v28.4s, #7
+ shl v28.4s, v28.4s, #25
+ ldur q18, [x29, #-128]
+ orr v28.16b, v28.16b, v29.16b
+ ushr v29.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v7.4s, v7.4s, v15.4s
+ orr v2.16b, v2.16b, v29.16b
+ add v17.4s, v17.4s, v27.4s
+ add v4.4s, v4.4s, v28.4s
+ add v7.4s, v7.4s, v2.4s
+ eor v3.16b, v3.16b, v17.16b
+ add v17.4s, v17.4s, v20.4s
+ eor v20.16b, v25.16b, v4.16b
+ add v4.4s, v4.4s, v21.4s
+ eor v21.16b, v22.16b, v7.16b
+ add v7.4s, v7.4s, v18.4s
+ add v18.4s, v24.4s, v0.4s
+ eor v22.16b, v23.16b, v18.16b
+ ldr q23, [sp, #160]
+ tbl v3.16b, { v3.16b }, v16.16b
+ tbl v20.16b, { v20.16b }, v16.16b
+ add v6.4s, v6.4s, v3.4s
+ add v18.4s, v18.4s, v23.4s
+ tbl v21.16b, { v21.16b }, v16.16b
+ tbl v16.16b, { v22.16b }, v16.16b
+ add v22.4s, v26.4s, v20.4s
+ eor v23.16b, v6.16b, v27.16b
+ add v1.4s, v1.4s, v21.4s
+ eor v24.16b, v22.16b, v28.16b
+ ushr v25.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ add v5.4s, v5.4s, v16.4s
+ eor v2.16b, v1.16b, v2.16b
+ orr v23.16b, v23.16b, v25.16b
+ ushr v25.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ eor v0.16b, v5.16b, v0.16b
+ orr v24.16b, v24.16b, v25.16b
+ ushr v25.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v2.16b, v2.16b, v25.16b
+ ushr v25.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ orr v0.16b, v0.16b, v25.16b
+ add v25.4s, v7.4s, v2.4s
+ add v26.4s, v18.4s, v0.4s
+ eor v18.16b, v21.16b, v25.16b
+ add v17.4s, v17.4s, v23.4s
+ add v4.4s, v4.4s, v24.4s
+ eor v16.16b, v16.16b, v26.16b
+ tbl v21.16b, { v18.16b }, v19.16b
+ eor v3.16b, v3.16b, v17.16b
+ eor v7.16b, v20.16b, v4.16b
+ tbl v16.16b, { v16.16b }, v19.16b
+ add v1.4s, v1.4s, v21.4s
+ tbl v3.16b, { v3.16b }, v19.16b
+ tbl v20.16b, { v7.16b }, v19.16b
+ eor v2.16b, v1.16b, v2.16b
+ eor v7.16b, v1.16b, v17.16b
+ add v1.4s, v5.4s, v16.4s
+ eor v0.16b, v1.16b, v0.16b
+ eor v18.16b, v1.16b, v4.16b
+ add v1.4s, v6.4s, v3.4s
+ eor v4.16b, v1.16b, v23.16b
+ eor v6.16b, v25.16b, v1.16b
+ add v1.4s, v22.4s, v20.4s
+ eor v5.16b, v1.16b, v24.16b
+ eor v17.16b, v26.16b, v1.16b
+ ushr v1.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ orr v1.16b, v4.16b, v1.16b
+ ushr v4.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v4.16b, v5.16b, v4.16b
+ ushr v5.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v2.16b, v2.16b, v5.16b
+ ushr v5.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ orr v0.16b, v0.16b, v5.16b
+ eor v10.16b, v0.16b, v20.16b
+ eor v11.16b, v1.16b, v21.16b
+ eor v19.16b, v4.16b, v16.16b
+ cmp x0, x22
+ eor v16.16b, v2.16b, v3.16b
+ mov w6, w19
+ b.ne .LBB2_4
+.LBB2_7:
+ zip1 v0.4s, v7.4s, v18.4s
+ zip2 v1.4s, v7.4s, v18.4s
+ zip1 v2.4s, v6.4s, v17.4s
+ zip2 v3.4s, v6.4s, v17.4s
+ zip1 v4.4s, v10.4s, v11.4s
+ zip2 v5.4s, v10.4s, v11.4s
+ zip1 v6.4s, v19.4s, v16.4s
+ zip2 v7.4s, v19.4s, v16.4s
+ add x15, x20, #4
+ tst w5, #0x1
+ sub x28, x28, #4
+ zip1 v16.2d, v0.2d, v2.2d
+ zip2 v0.2d, v0.2d, v2.2d
+ zip1 v2.2d, v1.2d, v3.2d
+ zip2 v1.2d, v1.2d, v3.2d
+ zip1 v3.2d, v4.2d, v6.2d
+ zip2 v4.2d, v4.2d, v6.2d
+ zip1 v6.2d, v5.2d, v7.2d
+ zip2 v5.2d, v5.2d, v7.2d
+ add x24, x24, #32
+ csel x20, x15, x20, ne
+ cmp x28, #3
+ stp q16, q3, [x26]
+ stp q0, q4, [x26, #32]
+ stp q2, q6, [x26, #64]
+ stp q1, q5, [x26, #96]
+ add x26, x26, #128
+ b.hi .LBB2_2
+.LBB2_8:
+ cbz x28, .LBB2_16
+ orr w8, w7, w19
+ and x21, x5, #0x1
+ stur w8, [x29, #-64]
+.LBB2_10:
+ ldr x8, [sp, #40]
+ ldr x25, [x24]
+ ldur w4, [x29, #-64]
+ ldp q1, q0, [x8]
+ mov x8, x22
+ stp q1, q0, [x29, #-48]
+.LBB2_11:
+ subs x23, x8, #1
+ b.eq .LBB2_13
+ cbnz x8, .LBB2_14
+ b .LBB2_15
+.LBB2_13:
+ orr w4, w4, w27
+.LBB2_14:
+ sub x0, x29, #48
+ mov w2, #64
+ mov x1, x25
+ mov x3, x20
+ bl zfs_blake3_compress_in_place_sse41
+ add x25, x25, #64
+ mov x8, x23
+ mov w4, w19
+ b .LBB2_11
+.LBB2_15:
+ ldp q0, q1, [x29, #-48]
+ add x20, x20, x21
+ add x24, x24, #8
+ subs x28, x28, #1
+ stp q0, q1, [x26], #32
+ b.ne .LBB2_10
+.LBB2_16:
+ add sp, sp, #448
+ ldp x20, x19, [sp, #144]
+ ldp x22, x21, [sp, #128]
+ ldp x24, x23, [sp, #112]
+ ldp x26, x25, [sp, #96]
+ ldp x28, x27, [sp, #80]
+ ldp x29, x30, [sp, #64]
+ ldp d9, d8, [sp, #48]
+ ldp d11, d10, [sp, #32]
+ ldp d13, d12, [sp, #16]
+ ldp d15, d14, [sp], #160
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S
new file mode 100644
index 000000000..9deba202f
--- /dev/null
+++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S
@@ -0,0 +1,2823 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE2 -> POWER8 PPC64 Little Endian
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ .text
+ .abiversion 2
+ .section .rodata.cst16,"aM",@progbits,16 /* 16-byte mergeable constants referenced by zfs_blake3_compress_in_place_sse2 */
+ .p2align 4
+.LCPI0_0: /* vperm control loaded into v2: swaps the halfwords of each 32-bit word (rotate by 16) */
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI0_1: /* loaded into v4; these four words match IV[0..3] of the BLAKE3 specification */
+ .long 1779033703 /* 0x6A09E667 */
+ .long 3144134277 /* 0xBB67AE85 */
+ .long 1013904242 /* 0x3C6EF372 */
+ .long 2773480762 /* 0xA54FF53A */
+.LCPI0_2: /* vperm control loaded into v9 (first use: combining the first two message rows) */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_3: /* vperm control loaded into v12 */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_4: /* vperm control loaded into v11 */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_5: /* vperm control loaded into v6 */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_6: /* one distinct bit per halfword lane; loaded into v17 and compared (vcmpequh) with .LCPI0_7 / .LCPI0_9 */
+ .short 1
+ .short 2
+ .short 4
+ .short 8
+ .short 16
+ .short 32
+ .short 64
+ .short 128
+.LCPI0_7: /* lanes equal to .LCPI0_6 (2, 3, 6, 7) become all-ones in the xxsel mask held in v0 */
+ .short 0
+ .short 0
+ .short 4
+ .short 8
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI0_8: /* vperm control loaded into v1 */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_9: /* lanes equal to .LCPI0_6 (6, 7) become all-ones in the AND mask held in v7 */
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI0_10: /* vperm control loaded into v17 (after v17 has served as the .LCPI0_6 compare operand) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_11: /* vperm control loaded into v8 */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_12: /* vperm control loaded into v10 */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_13: /* vperm control loaded into v12 (replacing .LCPI0_3 once that one is no longer needed) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_14: /* vperm control loaded into v19 near the end of the function */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .text /* Leaf routine (no bl, no stack frame): v28-v31 are spilled below r1 and reloaded before blr. */
+ .globl zfs_blake3_compress_in_place_sse2 /* r3 = 32-byte state, loaded at entry and overwritten with the result */
+ .p2align 2 /* r4 = message block, 64 bytes are read from it */
+ .type zfs_blake3_compress_in_place_sse2,@function /* r5 and r7 = byte-sized arguments, r6 = 64-bit argument split into two 32-bit words */
+zfs_blake3_compress_in_place_sse2: /* presumably (cv, block, block_len, counter, flags) as in upstream BLAKE3 - confirm against the C prototype */
+.Lfunc_begin0:
+ .cfi_startproc
+.Lfunc_gep0: /* ELFv2 global entry point: r12 holds this address */
+ addis 2, 12, .TOC.-.Lfunc_gep0@ha /* r2 = TOC pointer derived from r12 (high part) */
+ addi 2, 2, .TOC.-.Lfunc_gep0@l /* r2 = TOC pointer (low part) */
+.Lfunc_lep0:
+ .localentry zfs_blake3_compress_in_place_sse2, .Lfunc_lep0-.Lfunc_gep0 /* local entry point, for callers whose r2 is already valid */
+ li 8, -64
+ mtvsrd 35, 5 /* v3 <- r5 */
+ li 5, 16 /* r5 is reused as the +16 offset for the state load and store */
+ lfdx 0, 0, 4 /* block bytes 0..7 */
+ vspltisw 12, 9 /* v12 = 9 in every word; combined with -16 further down to form 25 */
+ stxvd2x 60, 1, 8 /* save v28 at r1-64 */
+ li 8, -48
+ mtvsrd 36, 7 /* v4 <- r7 */
+ lfd 2, 16(4) /* block bytes 16..23 */
+ stxvd2x 61, 1, 8 /* save v29 at r1-48 */
+ li 8, -32
+ lfd 1, 8(4) /* block bytes 8..15 */
+ mtvsrwz 37, 6 /* v5 <- low 32 bits of r6 */
+ rldicl 6, 6, 32, 32 /* r6 = r6 >> 32 (high word of the 64-bit argument) */
+ addis 7, 2, .LCPI0_2@toc@ha
+ stxvd2x 62, 1, 8 /* save v30 at r1-32 */
+ li 8, -16
+ addi 7, 7, .LCPI0_2@toc@l
+ stxvd2x 63, 1, 8 /* save v31 at r1-16 */
+ li 8, 0
+ lvx 9, 0, 7 /* v9 = .LCPI0_2 */
+ li 7, 48
+ mtvsrd 34, 8 /* v2 = 0, the zero source for the byte/halfword merges below */
+ xxmrghd 32, 1, 0
+ lxvd2x 0, 0, 3 /* state bytes 0..15 */
+ lxvd2x 1, 3, 5 /* state bytes 16..31 */
+ lfd 3, 24(4) /* block bytes 24..31 */
+ addis 8, 2, .LCPI0_5@toc@ha
+ vmrghb 3, 2, 3 /* interleave the r5 byte with zero bytes */
+ addi 8, 8, .LCPI0_5@toc@l
+ vmrghb 4, 2, 4 /* interleave the r7 byte with zero bytes */
+ vspltb 2, 2, 7
+ xxmrghd 33, 3, 2
+ vpkudum 7, 1, 0
+ vmrglh 3, 2, 3
+ vmrglh 2, 2, 4
+ mtvsrwz 36, 6 /* v4 <- high 32 bits of the r6 argument */
+ addis 6, 2, .LCPI0_0@toc@ha
+ addi 6, 6, .LCPI0_0@toc@l
+ vperm 10, 1, 0, 9
+ vmrghw 4, 4, 5
+ xxswapd 37, 1 /* doubleword swap after lxvd2x: v5 = state bytes 16..31 */
+ lxvd2x 1, 4, 7 /* block bytes 48..63 */
+ addis 7, 2, .LCPI0_8@toc@ha
+ addi 7, 7, .LCPI0_8@toc@l
+ vmrglw 2, 2, 3
+ xxswapd 35, 0 /* doubleword swap after lxvd2x: v3 = state bytes 0..15 */
+ xxswapd 41, 1
+ xxspltd 62, 42, 1
+ vadduwm 3, 7, 3
+ vadduwm 6, 3, 5
+ xxmrgld 36, 34, 36
+ lvx 2, 0, 6 /* v2 = .LCPI0_0, the rotate-by-16 vperm control used throughout */
+ addis 6, 2, .LCPI0_1@toc@ha
+ addi 6, 6, .LCPI0_1@toc@l
+ xxlxor 35, 38, 36
+ lvx 4, 0, 6 /* v4 = .LCPI0_1 constant words */
+ li 6, 32
+ lxvd2x 0, 4, 6 /* block bytes 32..47 */
+ addis 4, 2, .LCPI0_3@toc@ha
+ addis 6, 2, .LCPI0_7@toc@ha
+ vperm 8, 3, 3, 2
+ vspltisw 3, 10
+ addi 4, 4, .LCPI0_3@toc@l
+ addi 6, 6, .LCPI0_7@toc@l
+ vadduwm 3, 3, 3 /* v3 = 20 per word: vrlw by 20 is a rotate right by 12 */
+ vadduwm 11, 8, 4
+ xxlxor 36, 43, 37
+ vadduwm 5, 6, 10
+ vrlw 0, 4, 3
+ vspltisw 4, 12
+ vadduwm 4, 4, 4 /* v4 = 24 per word: vrlw by 24 is a rotate right by 8 */
+ vadduwm 1, 0, 5
+ xxlxor 37, 33, 40
+ xxswapd 40, 0
+ vrlw 6, 5, 4
+ vspltisw 5, -16
+ vpkudum 13, 9, 8
+ vsubuwm 5, 12, 5 /* v5 = 9 - (-16) = 25 per word: vrlw by 25 is a rotate right by 7 */
+ lvx 12, 0, 4 /* v12 = .LCPI0_3 */
+ addis 4, 2, .LCPI0_4@toc@ha
+ addi 4, 4, .LCPI0_4@toc@l
+ vadduwm 11, 6, 11
+ xxswapd 0, 38
+ vadduwm 1, 1, 13
+ xxsldwi 50, 45, 45, 1
+ xxlxor 32, 43, 32
+ xxsldwi 43, 43, 43, 3
+ xxsldwi 33, 33, 33, 1
+ vperm 12, 8, 9, 12
+ vrlw 0, 0, 5
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 0
+ vadduwm 1, 1, 12
+ vperm 6, 6, 6, 2
+ vadduwm 15, 6, 11
+ lvx 11, 0, 4 /* v11 = .LCPI0_4 */
+ addis 4, 2, .LCPI0_6@toc@ha
+ addi 4, 4, .LCPI0_6@toc@l
+ xxlxor 32, 47, 32
+ lvx 17, 0, 4 /* v17 = .LCPI0_6 (per-lane bit constants) */
+ addis 4, 2, .LCPI0_9@toc@ha
+ vperm 14, 10, 7, 11
+ addi 4, 4, .LCPI0_9@toc@l
+ vrlw 0, 0, 3
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 38
+ vrlw 6, 6, 4
+ vadduwm 8, 6, 15
+ xxswapd 0, 38
+ lvx 6, 0, 8 /* v6 = .LCPI0_5 */
+ xxlxor 32, 40, 32
+ xxsldwi 40, 40, 40, 1
+ vperm 13, 12, 18, 6
+ vrlw 9, 0, 5
+ vadduwm 0, 1, 14
+ lvx 1, 0, 7 /* v1 = .LCPI0_8 */
+ xxsldwi 46, 46, 46, 3
+ xxsldwi 32, 32, 32, 3
+ vperm 7, 7, 7, 1
+ vadduwm 15, 9, 0
+ xxlxor 32, 47, 0
+ vperm 16, 0, 0, 2
+ lvx 0, 0, 6 /* v0 = .LCPI0_7 */
+ addis 6, 2, .LCPI0_10@toc@ha
+ vcmpequh 0, 0, 17 /* v0 = halfword mask (.LCPI0_7 == .LCPI0_6), the selector for every xxsel below */
+ vadduwm 19, 16, 8
+ xxlxor 40, 51, 41
+ xxsel 45, 39, 45, 32
+ vrlw 31, 8, 3
+ lvx 8, 0, 4 /* v8 = .LCPI0_9 */
+ addis 4, 2, .LCPI0_11@toc@ha
+ addi 4, 4, .LCPI0_11@toc@l
+ vcmpequh 7, 8, 17 /* v7 = halfword mask (.LCPI0_9 == .LCPI0_6), used by the xxland instructions below */
+ vadduwm 8, 15, 13
+ vadduwm 15, 31, 8
+ lvx 8, 0, 4 /* v8 = .LCPI0_11 */
+ addi 4, 6, .LCPI0_10@toc@l
+ lvx 17, 0, 4 /* v17 = .LCPI0_10 */
+ addis 4, 2, .LCPI0_12@toc@ha
+ xxlxor 41, 47, 48
+ xxsldwi 47, 47, 47, 1
+ addi 4, 4, .LCPI0_12@toc@l
+ xxlnor 48, 39, 39 /* v16 = ~v7 */
+ vrlw 29, 9, 4
+ vperm 9, 16, 16, 8 /* v9 = inverted mask permuted by .LCPI0_11; second AND mask (VSR 41) from here on */
+ xxland 48, 50, 39
+ vperm 17, 30, 12, 17
+ vperm 16, 16, 16, 8
+ vmrghw 12, 12, 10
+ lvx 10, 0, 4 /* v10 = .LCPI0_12 */
+ addis 4, 2, .LCPI0_13@toc@ha
+ vadduwm 19, 29, 19
+ addi 4, 4, .LCPI0_13@toc@l
+ xxlxor 63, 51, 63
+ xxsldwi 51, 51, 51, 3
+ xxland 0, 49, 41
+ vrlw 17, 31, 5
+ xxlor 48, 0, 48
+ xxswapd 0, 61
+ vperm 18, 12, 18, 10
+ vadduwm 15, 15, 16
+ xxland 60, 48, 39
+ vadduwm 15, 17, 15
+ vperm 28, 28, 28, 8
+ xxlxor 63, 47, 0
+ vadduwm 15, 15, 18
+ vperm 31, 31, 31, 2
+ vperm 30, 18, 16, 6
+ vadduwm 19, 31, 19
+ xxlxor 44, 51, 49
+ vrlw 12, 12, 3
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 63
+ vperm 31, 13, 14, 11
+ vrlw 17, 17, 4
+ vperm 14, 14, 14, 1
+ vadduwm 15, 15, 31
+ vadduwm 19, 17, 19
+ xxswapd 0, 49
+ xxsldwi 47, 47, 47, 3
+ xxsel 46, 46, 62, 32
+ xxlxor 44, 51, 44
+ xxsldwi 51, 51, 51, 1
+ vrlw 12, 12, 5
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 19, 17, 19
+ xxlxor 44, 51, 44
+ vrlw 29, 12, 3
+ vadduwm 12, 15, 14
+ vadduwm 15, 29, 12
+ lvx 12, 0, 4 /* v12 = .LCPI0_13 */
+ addis 4, 2, .LCPI0_14@toc@ha
+ addi 4, 4, .LCPI0_14@toc@l
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ vperm 30, 13, 18, 12
+ vrlw 17, 17, 4
+ vmrghw 13, 18, 13
+ xxland 0, 62, 41
+ vadduwm 19, 17, 19
+ vperm 16, 13, 16, 10
+ xxlxor 61, 51, 61
+ xxsldwi 50, 51, 51, 3
+ xxsldwi 51, 63, 63, 3
+ vrlw 30, 29, 5
+ xxlor 61, 60, 0
+ xxswapd 0, 49
+ vperm 31, 14, 19, 11
+ vadduwm 15, 15, 29
+ vperm 19, 19, 19, 1
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 16
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 62
+ vperm 30, 16, 29, 6
+ vrlw 13, 13, 3
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 31
+ xxsldwi 63, 63, 63, 3
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 45, 50, 45
+ xxsldwi 50, 50, 50, 1
+ vrlw 13, 13, 5
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 45
+ vrlw 28, 13, 3
+ xxsel 45, 51, 62, 32
+ xxland 51, 61, 39
+ vperm 30, 14, 16, 12
+ vadduwm 15, 15, 13
+ vperm 19, 19, 19, 8
+ vmrghw 14, 16, 14
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 62, 41
+ vrlw 17, 17, 4
+ xxlor 51, 51, 0
+ vadduwm 15, 15, 19
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 60, 50, 60
+ xxsldwi 48, 50, 50, 3
+ vperm 18, 14, 29, 10
+ vrlw 30, 28, 5
+ vperm 29, 18, 19, 6
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 18
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 62
+ vperm 30, 13, 31, 11
+ vrlw 14, 14, 3
+ vperm 31, 31, 31, 1
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 46, 48, 46
+ xxsldwi 48, 48, 48, 1
+ vrlw 14, 14, 5
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 46
+ vrlw 28, 14, 3
+ xxsel 46, 63, 61, 32
+ xxland 63, 51, 39
+ vperm 29, 13, 18, 12
+ vadduwm 15, 15, 14
+ vperm 31, 31, 31, 8
+ vmrghw 13, 18, 13
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 61, 41
+ vrlw 17, 17, 4
+ xxlor 63, 63, 0
+ vperm 13, 13, 19, 10
+ xxsldwi 51, 62, 62, 3
+ vadduwm 15, 15, 31
+ vperm 30, 14, 19, 11
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 60, 48, 60
+ xxsldwi 48, 48, 48, 3
+ vrlw 29, 28, 5
+ vadduwm 15, 29, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 50, 48, 61
+ vrlw 18, 18, 3
+ vadduwm 15, 18, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 11, 17, 16
+ xxswapd 0, 49
+ xxlxor 48, 43, 50
+ xxsldwi 43, 43, 43, 1
+ vperm 18, 19, 19, 1
+ vrlw 16, 16, 5
+ vperm 19, 13, 31, 6
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 29, 17, 11
+ xxlxor 43, 61, 48
+ vrlw 16, 11, 3
+ xxsel 43, 50, 51, 32
+ xxland 50, 63, 39
+ vperm 19, 14, 13, 12
+ vadduwm 15, 15, 11
+ vperm 18, 18, 18, 8
+ vmrghw 13, 13, 14
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 51, 41
+ lvx 19, 0, 4 /* v19 = .LCPI0_14 */
+ vrlw 17, 17, 4
+ xxlor 50, 50, 0
+ vperm 13, 13, 31, 10
+ xxsldwi 63, 62, 62, 3
+ vadduwm 15, 15, 18
+ vperm 19, 11, 31, 19
+ vadduwm 29, 17, 29
+ xxswapd 0, 49
+ vperm 1, 31, 31, 1
+ xxlxor 48, 61, 48
+ xxsldwi 46, 61, 61, 3
+ vperm 6, 13, 18, 6
+ vrlw 16, 16, 5
+ xxsel 32, 33, 38, 32
+ xxland 38, 50, 39
+ vadduwm 15, 16, 15
+ vperm 7, 11, 13, 12
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vperm 6, 6, 6, 8
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 3
+ vrlw 17, 17, 4
+ vadduwm 15, 15, 19
+ vadduwm 14, 17, 14
+ xxswapd 0, 49
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 5
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vadduwm 0, 15, 0
+ vperm 17, 17, 17, 2
+ xxland 0, 39, 41
+ xxlor 38, 38, 0
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 0, 16, 0
+ xxlxor 33, 32, 49
+ xxsldwi 32, 32, 32, 1
+ vrlw 1, 1, 4
+ vadduwm 0, 0, 6
+ vadduwm 8, 1, 14
+ xxswapd 0, 33
+ xxlxor 44, 40, 48
+ xxsldwi 38, 40, 40, 3
+ vrlw 7, 12, 5
+ vadduwm 0, 7, 0
+ xxlxor 33, 32, 0
+ vperm 2, 1, 1, 2
+ vmrghw 1, 13, 11
+ vadduwm 6, 2, 6
+ vperm 1, 1, 18, 10
+ xxlxor 39, 38, 39
+ vrlw 3, 7, 3
+ vadduwm 0, 0, 1
+ vadduwm 0, 3, 0
+ xxlxor 34, 32, 34
+ xxsldwi 0, 32, 32, 3
+ vrlw 2, 2, 4
+ vadduwm 4, 2, 6
+ xxswapd 2, 34
+ xxlxor 35, 36, 35
+ xxsldwi 1, 36, 36, 1
+ vrlw 3, 3, 5
+ xxlxor 0, 1, 0 /* XOR two of the working rows into the first output half */
+ xxswapd 0, 0
+ xxlxor 1, 35, 2 /* XOR the other two working rows into the second output half */
+ stxvd2x 0, 0, 3 /* write state bytes 0..15 back through r3 */
+ xxswapd 1, 1
+ stxvd2x 1, 3, 5 /* write state bytes 16..31 (r5 still holds 16) */
+ li 3, -16
+ lxvd2x 63, 1, 3 /* restore v31 */
+ li 3, -32
+ lxvd2x 62, 1, 3 /* restore v30 */
+ li 3, -48
+ lxvd2x 61, 1, 3 /* restore v29 */
+ li 3, -64
+ lxvd2x 60, 1, 3 /* restore v28 */
+ blr
+ .long 0 /* data words emitted after the return; not reached by fall-through */
+ .quad 0
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16 /* 16-byte mergeable constants referenced by zfs_blake3_compress_xof_sse2; same values as .LCPI0_0-.LCPI0_14 */
+ .p2align 4
+.LCPI1_0: /* vperm control loaded into v2: swaps the halfwords of each 32-bit word (rotate by 16) */
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI1_1: /* loaded into v4; these four words match IV[0..3] of the BLAKE3 specification */
+ .long 1779033703 /* 0x6A09E667 */
+ .long 3144134277 /* 0xBB67AE85 */
+ .long 1013904242 /* 0x3C6EF372 */
+ .long 2773480762 /* 0xA54FF53A */
+.LCPI1_2: /* vperm control loaded into v9 (first use: combining the first two message rows) */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_3: /* vperm control loaded into v12 */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_4: /* vperm control loaded into v11 */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_5: /* vperm control loaded into v6 */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_6: /* one distinct bit per halfword lane; loaded into v17 and compared (vcmpequh) with .LCPI1_7 / .LCPI1_9 */
+ .short 1
+ .short 2
+ .short 4
+ .short 8
+ .short 16
+ .short 32
+ .short 64
+ .short 128
+.LCPI1_7: /* lanes equal to .LCPI1_6 (2, 3, 6, 7) become all-ones in the xxsel mask held in v0 */
+ .short 0
+ .short 0
+ .short 4
+ .short 8
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI1_8: /* vperm control loaded into v1 */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_9: /* lanes equal to .LCPI1_6 (6, 7) become all-ones in the AND mask held in v7 */
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 0
+ .short 64
+ .short 128
+.LCPI1_10: /* vperm control loaded into v17 (after v17 has served as the .LCPI1_6 compare operand) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 7
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_11: /* vperm control loaded into v8 */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_12: /* vperm control loaded into v10 */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_13: /* vperm control loaded into v12 (replacing .LCPI1_3 once that one is no longer needed) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_14: /* vperm control loaded into v19 near the end of the function */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .text /* Leaf routine (no bl, no stack frame): r30 and v28-v31 are spilled below r1 and reloaded before blr. */
+ .globl zfs_blake3_compress_xof_sse2 /* r3 = 32-byte input state (only read), r4 = message block (64 bytes read) */
+ .p2align 2 /* r5 and r7 = byte-sized arguments, r6 = 64-bit argument split into two 32-bit words */
+ .type zfs_blake3_compress_xof_sse2,@function /* r8 = output buffer, 64 bytes are written to it */
+zfs_blake3_compress_xof_sse2: /* presumably (cv, block, block_len, counter, flags, out) as in upstream BLAKE3 - confirm against the C prototype */
+.Lfunc_begin1:
+ .cfi_startproc
+.Lfunc_gep1: /* ELFv2 global entry point: r12 holds this address */
+ addis 2, 12, .TOC.-.Lfunc_gep1@ha /* r2 = TOC pointer derived from r12 (high part) */
+ addi 2, 2, .TOC.-.Lfunc_gep1@l /* r2 = TOC pointer (low part) */
+.Lfunc_lep1:
+ .localentry zfs_blake3_compress_xof_sse2, .Lfunc_lep1-.Lfunc_gep1 /* local entry point, for callers whose r2 is already valid */
+ li 9, -80
+ mtvsrd 35, 5 /* v3 <- r5 */
+ li 5, 16 /* r5 is reused as the +16 offset for the state load and the output store */
+ lfdx 0, 0, 4 /* block bytes 0..7 */
+ addis 10, 2, .LCPI1_2@toc@ha
+ vspltisw 12, 9 /* v12 = 9 in every word; combined with -16 further down to form 25 */
+ std 30, -16(1) /* save callee-saved r30 at r1-16 */
+ addis 12, 2, .LCPI1_8@toc@ha
+ addis 30, 2, .LCPI1_5@toc@ha
+ addis 11, 2, .LCPI1_7@toc@ha
+ stxvd2x 60, 1, 9 /* save v28 at r1-80 */
+ li 9, -64
+ mtvsrd 36, 7 /* v4 <- r7 */
+ lfd 2, 16(4) /* block bytes 16..23 */
+ addi 10, 10, .LCPI1_2@toc@l
+ addi 12, 12, .LCPI1_8@toc@l
+ addi 11, 11, .LCPI1_7@toc@l
+ stxvd2x 61, 1, 9 /* save v29 at r1-64 */
+ li 9, -48
+ lfd 3, 24(4) /* block bytes 24..31 */
+ mtvsrwz 37, 6 /* v5 <- low 32 bits of r6 */
+ rldicl 6, 6, 32, 32 /* r6 = r6 >> 32 (high word of the 64-bit argument) */
+ lvx 9, 0, 10 /* v9 = .LCPI1_2 */
+ stxvd2x 62, 1, 9 /* save v30 at r1-48 */
+ li 9, -32
+ li 10, 32 /* r10 = +32 offset, used for the block load and the output store */
+ stxvd2x 63, 1, 9 /* save v31 at r1-32 */
+ li 9, 0
+ mtvsrd 34, 9 /* v2 = 0, the zero source for the byte/halfword merges below */
+ xxmrghd 33, 3, 2
+ lfd 1, 8(4) /* block bytes 8..15 */
+ vmrghb 3, 2, 3 /* interleave the r5 byte with zero bytes */
+ vmrghb 4, 2, 4 /* interleave the r7 byte with zero bytes */
+ vspltb 2, 2, 7
+ xxmrghd 32, 1, 0
+ lxvd2x 0, 0, 3 /* state bytes 0..15 */
+ lxvd2x 1, 3, 5 /* state bytes 16..31 */
+ vpkudum 7, 1, 0
+ vmrglh 3, 2, 3
+ vmrglh 2, 2, 4
+ mtvsrwz 36, 6 /* v4 <- high 32 bits of the r6 argument */
+ addis 6, 2, .LCPI1_0@toc@ha
+ addi 6, 6, .LCPI1_0@toc@l
+ vperm 10, 1, 0, 9
+ vmrghw 4, 4, 5
+ xxswapd 37, 1 /* doubleword swap after lxvd2x: v5 = state bytes 16..31 */
+ vmrglw 2, 2, 3
+ xxswapd 35, 0 /* doubleword swap after lxvd2x: v3 = state bytes 0..15 */
+ lxvd2x 0, 4, 10 /* block bytes 32..47 */
+ xxspltd 62, 42, 1
+ vadduwm 3, 7, 3
+ vadduwm 6, 3, 5
+ xxmrgld 36, 34, 36
+ lvx 2, 0, 6 /* v2 = .LCPI1_0, the rotate-by-16 vperm control used throughout */
+ addis 6, 2, .LCPI1_1@toc@ha
+ addi 6, 6, .LCPI1_1@toc@l
+ xxlxor 35, 38, 36
+ lvx 4, 0, 6 /* v4 = .LCPI1_1 constant words */
+ li 6, 48 /* r6 = +48 offset, used for the block load and the final output store */
+ lxvd2x 1, 4, 6 /* block bytes 48..63 */
+ addis 4, 2, .LCPI1_3@toc@ha
+ vperm 8, 3, 3, 2
+ vspltisw 3, 10
+ addi 4, 4, .LCPI1_3@toc@l
+ xxswapd 41, 1
+ vadduwm 3, 3, 3 /* v3 = 20 per word: vrlw by 20 is a rotate right by 12 */
+ vadduwm 11, 8, 4
+ xxlxor 36, 43, 37
+ vadduwm 5, 6, 10
+ vrlw 0, 4, 3
+ vspltisw 4, 12
+ vadduwm 4, 4, 4 /* v4 = 24 per word: vrlw by 24 is a rotate right by 8 */
+ vadduwm 1, 0, 5
+ xxlxor 37, 33, 40
+ xxswapd 40, 0
+ vrlw 6, 5, 4
+ vspltisw 5, -16
+ vpkudum 13, 9, 8
+ vsubuwm 5, 12, 5 /* v5 = 9 - (-16) = 25 per word: vrlw by 25 is a rotate right by 7 */
+ lvx 12, 0, 4 /* v12 = .LCPI1_3 */
+ addis 4, 2, .LCPI1_4@toc@ha
+ addi 4, 4, .LCPI1_4@toc@l
+ vadduwm 11, 6, 11
+ xxswapd 0, 38
+ vadduwm 1, 1, 13
+ xxsldwi 50, 45, 45, 1
+ xxlxor 32, 43, 32
+ xxsldwi 43, 43, 43, 3
+ xxsldwi 33, 33, 33, 1
+ vperm 12, 8, 9, 12
+ vrlw 0, 0, 5
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 0
+ vadduwm 1, 1, 12
+ vperm 6, 6, 6, 2
+ vadduwm 15, 6, 11
+ lvx 11, 0, 4 /* v11 = .LCPI1_4 */
+ addis 4, 2, .LCPI1_6@toc@ha
+ addi 4, 4, .LCPI1_6@toc@l
+ xxlxor 32, 47, 32
+ lvx 17, 0, 4 /* v17 = .LCPI1_6 (per-lane bit constants) */
+ addi 4, 30, .LCPI1_5@toc@l
+ vperm 14, 10, 7, 11
+ vrlw 0, 0, 3
+ vadduwm 1, 0, 1
+ xxlxor 38, 33, 38
+ vrlw 6, 6, 4
+ vadduwm 8, 6, 15
+ xxswapd 0, 38
+ lvx 6, 0, 4 /* v6 = .LCPI1_5 */
+ addis 4, 2, .LCPI1_9@toc@ha
+ addi 4, 4, .LCPI1_9@toc@l
+ xxlxor 32, 40, 32
+ xxsldwi 40, 40, 40, 1
+ vperm 13, 12, 18, 6
+ vrlw 9, 0, 5
+ vadduwm 0, 1, 14
+ lvx 1, 0, 12 /* v1 = .LCPI1_8 */
+ xxsldwi 46, 46, 46, 3
+ xxsldwi 32, 32, 32, 3
+ vperm 7, 7, 7, 1
+ vadduwm 15, 9, 0
+ xxlxor 32, 47, 0
+ vperm 16, 0, 0, 2
+ lvx 0, 0, 11 /* v0 = .LCPI1_7 */
+ addis 11, 2, .LCPI1_10@toc@ha
+ vcmpequh 0, 0, 17 /* v0 = halfword mask (.LCPI1_7 == .LCPI1_6), the selector for every xxsel below */
+ vadduwm 19, 16, 8
+ xxlxor 40, 51, 41
+ xxsel 45, 39, 45, 32
+ vrlw 31, 8, 3
+ lvx 8, 0, 4 /* v8 = .LCPI1_9 */
+ addis 4, 2, .LCPI1_11@toc@ha
+ addi 4, 4, .LCPI1_11@toc@l
+ vcmpequh 7, 8, 17 /* v7 = halfword mask (.LCPI1_9 == .LCPI1_6), used by the xxland instructions below */
+ vadduwm 8, 15, 13
+ vadduwm 15, 31, 8
+ lvx 8, 0, 4 /* v8 = .LCPI1_11 */
+ addi 4, 11, .LCPI1_10@toc@l
+ lvx 17, 0, 4 /* v17 = .LCPI1_10 */
+ addis 4, 2, .LCPI1_12@toc@ha
+ xxlxor 41, 47, 48
+ xxsldwi 47, 47, 47, 1
+ addi 4, 4, .LCPI1_12@toc@l
+ xxlnor 48, 39, 39 /* v16 = ~v7 */
+ vrlw 29, 9, 4
+ vperm 9, 16, 16, 8 /* v9 = inverted mask permuted by .LCPI1_11; second AND mask (VSR 41) from here on */
+ xxland 48, 50, 39
+ vperm 17, 30, 12, 17
+ vperm 16, 16, 16, 8
+ vmrghw 12, 12, 10
+ lvx 10, 0, 4 /* v10 = .LCPI1_12 */
+ addis 4, 2, .LCPI1_13@toc@ha
+ vadduwm 19, 29, 19
+ addi 4, 4, .LCPI1_13@toc@l
+ xxlxor 63, 51, 63
+ xxsldwi 51, 51, 51, 3
+ xxland 0, 49, 41
+ vrlw 17, 31, 5
+ xxlor 48, 0, 48
+ xxswapd 0, 61
+ vperm 18, 12, 18, 10
+ vadduwm 15, 15, 16
+ xxland 60, 48, 39
+ vadduwm 15, 17, 15
+ vperm 28, 28, 28, 8
+ xxlxor 63, 47, 0
+ vadduwm 15, 15, 18
+ vperm 31, 31, 31, 2
+ vperm 30, 18, 16, 6
+ vadduwm 19, 31, 19
+ xxlxor 44, 51, 49
+ vrlw 12, 12, 3
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 63
+ vperm 31, 13, 14, 11
+ vrlw 17, 17, 4
+ vperm 14, 14, 14, 1
+ vadduwm 15, 15, 31
+ vadduwm 19, 17, 19
+ xxswapd 0, 49
+ xxsldwi 47, 47, 47, 3
+ xxsel 46, 46, 62, 32
+ xxlxor 44, 51, 44
+ xxsldwi 51, 51, 51, 1
+ vrlw 12, 12, 5
+ vadduwm 15, 12, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 19, 17, 19
+ xxlxor 44, 51, 44
+ vrlw 29, 12, 3
+ vadduwm 12, 15, 14
+ vadduwm 15, 29, 12
+ lvx 12, 0, 4 /* v12 = .LCPI1_13 */
+ addis 4, 2, .LCPI1_14@toc@ha
+ addi 4, 4, .LCPI1_14@toc@l
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ vperm 30, 13, 18, 12
+ vrlw 17, 17, 4
+ vmrghw 13, 18, 13
+ xxland 0, 62, 41
+ vadduwm 19, 17, 19
+ vperm 16, 13, 16, 10
+ xxlxor 61, 51, 61
+ xxsldwi 50, 51, 51, 3
+ xxsldwi 51, 63, 63, 3
+ vrlw 30, 29, 5
+ xxlor 61, 60, 0
+ xxswapd 0, 49
+ vperm 31, 14, 19, 11
+ vadduwm 15, 15, 29
+ vperm 19, 19, 19, 1
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 16
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 62
+ vperm 30, 16, 29, 6
+ vrlw 13, 13, 3
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 31
+ xxsldwi 63, 63, 63, 3
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 45, 50, 45
+ xxsldwi 50, 50, 50, 1
+ vrlw 13, 13, 5
+ vadduwm 15, 13, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 18, 17, 18
+ xxlxor 45, 50, 45
+ vrlw 28, 13, 3
+ xxsel 45, 51, 62, 32
+ xxland 51, 61, 39
+ vperm 30, 14, 16, 12
+ vadduwm 15, 15, 13
+ vperm 19, 19, 19, 8
+ vmrghw 14, 16, 14
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 62, 41
+ vrlw 17, 17, 4
+ xxlor 51, 51, 0
+ vadduwm 15, 15, 19
+ vadduwm 18, 17, 18
+ xxswapd 0, 49
+ xxlxor 60, 50, 60
+ xxsldwi 48, 50, 50, 3
+ vperm 18, 14, 29, 10
+ vrlw 30, 28, 5
+ vperm 29, 18, 19, 6
+ vadduwm 15, 30, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 18
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 62
+ vperm 30, 13, 31, 11
+ vrlw 14, 14, 3
+ vperm 31, 31, 31, 1
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 46, 48, 46
+ xxsldwi 48, 48, 48, 1
+ vrlw 14, 14, 5
+ vadduwm 15, 14, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 46, 48, 46
+ vrlw 28, 14, 3
+ xxsel 46, 63, 61, 32
+ xxland 63, 51, 39
+ vperm 29, 13, 18, 12
+ vadduwm 15, 15, 14
+ vperm 31, 31, 31, 8
+ vmrghw 13, 18, 13
+ vadduwm 15, 28, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 61, 41
+ vrlw 17, 17, 4
+ xxlor 63, 63, 0
+ vperm 13, 13, 19, 10
+ xxsldwi 51, 62, 62, 3
+ vadduwm 15, 15, 31
+ vperm 30, 14, 19, 11
+ vadduwm 16, 17, 16
+ xxswapd 0, 49
+ xxlxor 60, 48, 60
+ xxsldwi 48, 48, 48, 3
+ vrlw 29, 28, 5
+ vadduwm 15, 29, 15
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vadduwm 16, 17, 16
+ xxlxor 50, 48, 61
+ vrlw 18, 18, 3
+ vadduwm 15, 18, 15
+ xxlxor 49, 47, 49
+ vadduwm 15, 15, 30
+ vrlw 17, 17, 4
+ xxsldwi 47, 47, 47, 3
+ vadduwm 11, 17, 16
+ xxswapd 0, 49
+ xxlxor 48, 43, 50
+ xxsldwi 43, 43, 43, 1
+ vperm 18, 19, 19, 1
+ vrlw 16, 16, 5
+ vperm 19, 13, 31, 6
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vperm 17, 17, 17, 2
+ vadduwm 29, 17, 11
+ xxlxor 43, 61, 48
+ vrlw 16, 11, 3
+ xxsel 43, 50, 51, 32
+ xxland 50, 63, 39
+ vperm 19, 14, 13, 12
+ vadduwm 15, 15, 11
+ vperm 18, 18, 18, 8
+ vmrghw 13, 13, 14
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 1
+ xxland 0, 51, 41
+ lvx 19, 0, 4 /* v19 = .LCPI1_14 */
+ vrlw 17, 17, 4
+ xxlor 50, 50, 0
+ vperm 13, 13, 31, 10
+ xxsldwi 63, 62, 62, 3
+ vadduwm 15, 15, 18
+ vperm 19, 11, 31, 19
+ vadduwm 29, 17, 29
+ xxswapd 0, 49
+ vperm 1, 31, 31, 1
+ xxlxor 48, 61, 48
+ xxsldwi 46, 61, 61, 3
+ vperm 6, 13, 18, 6
+ vrlw 16, 16, 5
+ xxsel 32, 33, 38, 32
+ xxland 38, 50, 39
+ vadduwm 15, 16, 15
+ vperm 7, 11, 13, 12
+ xxlxor 49, 47, 0
+ vadduwm 15, 15, 13
+ vperm 17, 17, 17, 2
+ vperm 6, 6, 6, 8
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 49
+ xxsldwi 47, 47, 47, 3
+ vrlw 17, 17, 4
+ vadduwm 15, 15, 19
+ vadduwm 14, 17, 14
+ xxswapd 0, 49
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 5
+ vadduwm 15, 16, 15
+ xxlxor 49, 47, 0
+ vadduwm 0, 15, 0
+ vperm 17, 17, 17, 2
+ xxland 0, 39, 41
+ xxlor 38, 38, 0
+ vadduwm 14, 17, 14
+ xxlxor 48, 46, 48
+ vrlw 16, 16, 3
+ vadduwm 0, 16, 0
+ xxlxor 33, 32, 49
+ xxsldwi 32, 32, 32, 1
+ vrlw 1, 1, 4
+ vadduwm 0, 0, 6
+ vadduwm 8, 1, 14
+ xxswapd 0, 33
+ xxlxor 44, 40, 48
+ xxsldwi 38, 40, 40, 3
+ vrlw 7, 12, 5
+ vadduwm 0, 7, 0
+ xxlxor 33, 32, 0
+ vperm 2, 1, 1, 2
+ vmrghw 1, 13, 11
+ vadduwm 6, 2, 6
+ vperm 1, 1, 18, 10
+ xxlxor 39, 38, 39
+ vrlw 3, 7, 3
+ vadduwm 0, 0, 1
+ vadduwm 0, 3, 0
+ xxlxor 34, 32, 34
+ xxsldwi 0, 32, 32, 3
+ vrlw 2, 2, 4
+ vadduwm 4, 2, 6
+ xxswapd 2, 34
+ xxlxor 35, 36, 35
+ xxsldwi 1, 36, 36, 1
+ vrlw 3, 3, 5
+ xxlxor 0, 1, 0 /* XOR two of the working rows into the first output quarter */
+ xxswapd 0, 0
+ xxlxor 3, 35, 2 /* XOR the other two working rows into the second output quarter */
+ stxvd2x 0, 0, 8 /* out bytes 0..15 */
+ xxswapd 3, 3
+ stxvd2x 3, 8, 5 /* out bytes 16..31 (r5 = 16) */
+ lfdx 0, 0, 3 /* reload input state bytes 0..7 */
+ lfd 3, 8(3) /* reload input state bytes 8..15 */
+ xxmrghd 34, 3, 0
+ xxlxor 0, 1, 34 /* working row XOR input state bytes 0..15 */
+ xxswapd 0, 0
+ stxvd2x 0, 8, 10 /* out bytes 32..47 (r10 = 32) */
+ lfd 0, 16(3) /* reload input state bytes 16..23 */
+ lfd 1, 24(3) /* reload input state bytes 24..31 */
+ li 3, -32
+ xxmrghd 34, 1, 0
+ xxlxor 0, 2, 34 /* working row XOR input state bytes 16..31 */
+ xxswapd 0, 0
+ stxvd2x 0, 8, 6 /* out bytes 48..63 (r6 = 48) */
+ lxvd2x 63, 1, 3 /* restore v31 */
+ li 3, -48
+ ld 30, -16(1) /* restore r30 */
+ lxvd2x 62, 1, 3 /* restore v30 */
+ li 3, -64
+ lxvd2x 61, 1, 3 /* restore v29 */
+ li 3, -80
+ lxvd2x 60, 1, 3 /* restore v28 */
+ blr
+ .long 0 /* data words emitted after the return; not reached by fall-through */
+ .quad 0
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-.Lfunc_begin1
+ .cfi_endproc
+
+ .globl zfs_blake3_hash_many_sse2 /* r3 = array of input pointers, r4 = number of inputs, r5 = 64-byte blocks per input, r6 = 32-byte initial state */
+ .p2align 2 /* r7 = counter, r8 = increment flag (bit 0 is tested), r9 = flags, r10 = extra flags for the first block */
+ .type zfs_blake3_hash_many_sse2,@function /* stack arguments: extra flags for the last block, then the output pointer (32 bytes written per input) */
+zfs_blake3_hash_many_sse2: /* argument roles presumably match upstream blake3_hash_many - confirm against the C prototype */
+.Lfunc_begin2:
+ .cfi_startproc
+.Lfunc_gep2: /* ELFv2 global entry point: r12 holds this address */
+ addis 2, 12, .TOC.-.Lfunc_gep2@ha /* r2 = TOC pointer derived from r12 (high part) */
+ addi 2, 2, .TOC.-.Lfunc_gep2@l /* r2 = TOC pointer (low part) */
+.Lfunc_lep2:
+ .localentry zfs_blake3_hash_many_sse2, .Lfunc_lep2-.Lfunc_gep2 /* local entry point, for callers whose r2 is already valid */
+ mfocrf 12, 32 /* r12 = CR field 2, which this function overwrites and restores */
+ mflr 0
+ std 0, 16(1) /* save LR in the caller's frame */
+ stw 12, 8(1) /* save CR in the caller's frame */
+ stdu 1, -256(1) /* allocate a 256-byte stack frame */
+ .cfi_def_cfa_offset 256
+ .cfi_offset lr, 16
+ .cfi_offset r17, -120
+ .cfi_offset r18, -112
+ .cfi_offset r19, -104
+ .cfi_offset r20, -96
+ .cfi_offset r21, -88
+ .cfi_offset r22, -80
+ .cfi_offset r23, -72
+ .cfi_offset r24, -64
+ .cfi_offset r25, -56
+ .cfi_offset r26, -48
+ .cfi_offset r27, -40
+ .cfi_offset r28, -32
+ .cfi_offset r29, -24
+ .cfi_offset r30, -16
+ .cfi_offset cr2, 8
+ std 26, 208(1)
+ mr 26, 4 /* r26 = inputs still to process */
+ cmpldi 1, 4, 4 /* cr1 = number of inputs compared with 4 */
+ andi. 4, 8, 1 /* cr0 = (r8 & 1), the increment flag */
+ std 18, 144(1)
+ std 19, 152(1)
+ crmove 8, 1 /* cr2.lt = cr0.gt: set when the increment flag is non-zero */
+ ld 19, 360(1) /* r19 = output pointer, read from the caller's frame */
+ lwz 18, 352(1) /* r18 = extra flags for the last block, read from the caller's frame */
+ std 24, 192(1)
+ std 25, 200(1)
+ std 27, 216(1)
+ std 28, 224(1)
+ mr 24, 10 /* r24 = extra flags for the first block */
+ mr 28, 6 /* r28 = initial state pointer */
+ mr 27, 5 /* r27 = blocks per input */
+ mr 25, 3 /* r25 = cursor into the input pointer array */
+ std 29, 232(1)
+ std 30, 240(1)
+ mr 30, 9 /* r30 = flags */
+ mr 29, 7 /* r29 = running counter */
+ std 17, 136(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ blt 1, .LBB2_3 /* fewer than 4 inputs: skip the 4-way loop */
+ li 3, 0
+ li 4, 1
+ clrldi 23, 30, 32 /* r23 = flags, zero-extended from 32 bits */
+ isel 22, 4, 3, 8 /* r22 = increment flag set ? 1 : 0 */
+ clrldi 21, 24, 32 /* r21 = first-block flags, zero-extended from 32 bits */
+ clrldi 20, 18, 32 /* r20 = last-block flags, zero-extended from 32 bits */
+.LBB2_2: /* 4-way loop: one blake3_hash4_sse2 call consumes four input pointers */
+ mr 3, 25
+ mr 4, 27
+ mr 5, 28
+ mr 6, 29
+ mr 7, 22
+ mr 8, 23
+ mr 9, 21
+ mr 10, 20
+ std 19, 32(1) /* the output pointer is passed to blake3_hash4_sse2 on the stack at 32(r1) */
+ bl blake3_hash4_sse2
+ addi 26, 26, -4 /* four inputs done */
+ addi 3, 29, 4
+ addi 25, 25, 32 /* advance past four 8-byte input pointers */
+ addi 19, 19, 128 /* advance the output by 4 x 32 bytes */
+ cmpldi 26, 3
+ isel 29, 3, 29, 8 /* counter += 4 only when the increment flag is set */
+ bgt 0, .LBB2_2 /* repeat while more than 3 inputs remain */
+.LBB2_3: /* remaining inputs (fewer than 4) are hashed one at a time */
+ cmpldi 26, 0
+ beq 0, .LBB2_11
+ li 3, 0
+ li 4, 1
+ or 21, 24, 30 /* r21 = flags | first-block flags */
+ li 20, 16
+ addi 24, 1, 96 /* r24 = 32-byte scratch state at 96(r1) */
+ isel 22, 4, 3, 8 /* r22 = increment flag set ? 1 : 0 */
+.LBB2_5: /* per-input setup: copy the 32-byte initial state into the scratch buffer */
+ lxvd2x 0, 28, 20
+ ld 23, 0(25) /* r23 = current input pointer */
+ mr 17, 27 /* r17 = blocks left for this input */
+ mr 3, 21 /* the first block uses flags | first-block flags */
+ stxvd2x 0, 24, 20
+ lxvd2x 0, 0, 28
+ stxvd2x 0, 0, 24
+.LBB2_6: /* block loop: r3 holds the flags for the next block */
+ cmpldi 17, 1
+ beq 0, .LBB2_8 /* exactly one block left: add the last-block flags */
+ cmpldi 17, 0
+ bne 0, .LBB2_9
+ b .LBB2_10 /* no blocks left: store the result */
+.LBB2_8:
+ or 3, 3, 18 /* OR in the last-block flags */
+.LBB2_9:
+ clrldi 7, 3, 56 /* r7 = flags truncated to 8 bits */
+ mr 3, 24 /* r3 = scratch state, updated in place */
+ mr 4, 23 /* r4 = current 64-byte block */
+ li 5, 64 /* r5 = 64 */
+ mr 6, 29 /* r6 = counter */
+ bl zfs_blake3_compress_in_place_sse2
+ addi 23, 23, 64 /* next block */
+ addi 17, 17, -1
+ mr 3, 30 /* later blocks start from the plain flags */
+ b .LBB2_6
+.LBB2_10: /* copy the 32-byte scratch state to the output and move on to the next input */
+ lxvd2x 0, 24, 20
+ addi 26, 26, -1
+ add 29, 29, 22 /* counter += 1 when the increment flag is set, += 0 otherwise */
+ addi 25, 25, 8 /* next input pointer */
+ cmpldi 26, 0
+ stxvd2x 0, 19, 20
+ lxvd2x 0, 0, 24
+ stxvd2x 0, 0, 19
+ addi 19, 19, 32 /* next 32-byte output slot */
+ bne 0, .LBB2_5
+.LBB2_11: /* epilogue: restore r17-r30, pop the frame, restore LR and CR field 2 */
+ ld 30, 240(1)
+ ld 29, 232(1)
+ ld 28, 224(1)
+ ld 27, 216(1)
+ ld 26, 208(1)
+ ld 25, 200(1)
+ ld 24, 192(1)
+ ld 23, 184(1)
+ ld 22, 176(1)
+ ld 21, 168(1)
+ ld 20, 160(1)
+ ld 19, 152(1)
+ ld 18, 144(1)
+ ld 17, 136(1)
+ addi 1, 1, 256
+ ld 0, 16(1)
+ lwz 12, 8(1)
+ mtocrf 32, 12
+ mtlr 0
+ blr
+ .long 0 /* data words emitted after the return; not reached by fall-through */
+ .quad 0
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end2-.Lfunc_begin2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16 /* 16-byte mergeable constants referenced by blake3_hash4_sse2 */
+ .p2align 4
+.LCPI3_0: /* the 32-bit words 0, 1, 2, 3 in memory order; ANDed with the splatted negated increment argument */
+ .quad 4294967296 /* 0x0000000100000000: words 0 and 1 */
+ .quad 12884901890 /* 0x0000000300000002: words 2 and 3 */
+.LCPI3_1: /* vperm control loaded into v5: swaps the halfwords of each 32-bit word (rotate by 16) */
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI3_2: /* 0x6A09E667 in all four lanes (matches BLAKE3 IV[0]); kept in VSR 12 */
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+.LCPI3_3: /* 0xBB67AE85 in all four lanes (matches BLAKE3 IV[1]); kept in VSR 13 */
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+.LCPI3_4: /* 0x3C6EF372 in all four lanes (matches BLAKE3 IV[2]); kept in VSR 31 */
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+.LCPI3_5: /* 0xA54FF53A in all four lanes (matches BLAKE3 IV[3]); kept in VSR 30 */
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .text
+ .p2align 2
+ .type blake3_hash4_sse2,@function
+blake3_hash4_sse2:
+.Lfunc_begin3:
+ .cfi_startproc
+.Lfunc_gep3:
+ addis 2, 12, .TOC.-.Lfunc_gep3@ha
+ addi 2, 2, .TOC.-.Lfunc_gep3@l
+.Lfunc_lep3:
+ .localentry blake3_hash4_sse2, .Lfunc_lep3-.Lfunc_gep3
+ stdu 1, -400(1)
+ .cfi_def_cfa_offset 400
+ .cfi_offset r22, -152
+ .cfi_offset r23, -144
+ .cfi_offset r24, -136
+ .cfi_offset r25, -128
+ .cfi_offset r26, -120
+ .cfi_offset r27, -112
+ .cfi_offset r28, -104
+ .cfi_offset r29, -96
+ .cfi_offset r30, -88
+ .cfi_offset f23, -72
+ .cfi_offset f24, -64
+ .cfi_offset f25, -56
+ .cfi_offset f26, -48
+ .cfi_offset f27, -40
+ .cfi_offset f28, -32
+ .cfi_offset f29, -24
+ .cfi_offset f30, -16
+ .cfi_offset f31, -8
+ .cfi_offset v20, -352
+ .cfi_offset v21, -336
+ .cfi_offset v22, -320
+ .cfi_offset v23, -304
+ .cfi_offset v24, -288
+ .cfi_offset v25, -272
+ .cfi_offset v26, -256
+ .cfi_offset v27, -240
+ .cfi_offset v28, -224
+ .cfi_offset v29, -208
+ .cfi_offset v30, -192
+ .cfi_offset v31, -176
+ li 11, 48
+ li 0, 8
+ std 30, 312(1)
+ li 30, 12
+ li 12, 4
+ lfiwzx 0, 0, 5
+ stxvd2x 52, 1, 11
+ li 11, 64
+ lfiwzx 2, 5, 0
+ li 0, 20
+ lfiwzx 3, 5, 30
+ stxvd2x 53, 1, 11
+ li 11, 80
+ li 30, 24
+ lfiwzx 4, 5, 0
+ li 0, 28
+ stxvd2x 54, 1, 11
+ li 11, 96
+ lfiwzx 1, 5, 12
+ lfiwzx 6, 5, 30
+ xxspltw 45, 0, 1
+ cmpldi 4, 0
+ std 22, 248(1)
+ stxvd2x 55, 1, 11
+ li 11, 112
+ lfiwzx 7, 5, 0
+ xxspltw 40, 2, 1
+ std 23, 256(1)
+ xxspltw 38, 3, 1
+ xxspltw 50, 4, 1
+ std 24, 264(1)
+ std 25, 272(1)
+ std 26, 280(1)
+ xxspltw 54, 7, 1
+ std 27, 288(1)
+ std 28, 296(1)
+ std 29, 304(1)
+ stxvd2x 56, 1, 11
+ li 11, 128
+ stfd 23, 328(1)
+ stxvd2x 57, 1, 11
+ li 11, 144
+ stfd 24, 336(1)
+ stxvd2x 58, 1, 11
+ li 11, 160
+ stfd 25, 344(1)
+ stxvd2x 59, 1, 11
+ li 11, 176
+ xxspltw 59, 1, 1
+ stxvd2x 60, 1, 11
+ li 11, 192
+ stfd 26, 352(1)
+ stxvd2x 61, 1, 11
+ li 11, 208
+ stfd 27, 360(1)
+ stxvd2x 62, 1, 11
+ li 11, 224
+ xxspltw 62, 6, 1
+ stxvd2x 63, 1, 11
+ li 11, 16
+ stfd 28, 368(1)
+ lfiwzx 5, 5, 11
+ ld 5, 432(1)
+ stfd 29, 376(1)
+ stfd 30, 384(1)
+ stfd 31, 392(1)
+ xxspltw 61, 5, 1
+ beq 0, .LBB3_5
+ addis 30, 2, .LCPI3_0@toc@ha
+ neg 7, 7
+ xxleqv 34, 34, 34
+ addis 28, 2, .LCPI3_2@toc@ha
+ addis 27, 2, .LCPI3_3@toc@ha
+ addis 26, 2, .LCPI3_4@toc@ha
+ addis 25, 2, .LCPI3_5@toc@ha
+ ld 29, 24(3)
+ addi 0, 30, .LCPI3_0@toc@l
+ mtfprwz 1, 7
+ addis 7, 2, .LCPI3_1@toc@ha
+ ld 30, 16(3)
+ lxvd2x 0, 0, 0
+ mtfprwz 2, 6
+ rldicl 6, 6, 32, 32
+ addi 0, 7, .LCPI3_1@toc@l
+ ld 7, 8(3)
+ vslw 2, 2, 2
+ lvx 5, 0, 0
+ addi 0, 28, .LCPI3_2@toc@l
+ addi 28, 27, .LCPI3_3@toc@l
+ addi 27, 26, .LCPI3_4@toc@l
+ addi 26, 25, .LCPI3_5@toc@l
+ or 25, 9, 8
+ li 9, 0
+ xxspltw 36, 2, 1
+ xxswapd 35, 0
+ xxspltw 0, 1, 1
+ xxland 35, 0, 35
+ mtfprwz 0, 6
+ ld 6, 0(3)
+ addi 3, 3, -8
+ vadduwm 4, 3, 4
+ xxlor 35, 35, 34
+ xxlxor 34, 36, 34
+ xxlor 9, 36, 36
+ vspltisw 4, 4
+ vcmpgtsw 2, 3, 2
+ xxspltw 35, 0, 1
+ xxlor 10, 36, 36
+ vsubuwm 2, 3, 2
+ xxlor 11, 34, 34
+ lvx 2, 0, 0
+ li 0, 32
+ xxlor 12, 34, 34
+ lvx 2, 0, 28
+ li 28, 48
+ xxlor 13, 34, 34
+ lvx 2, 0, 27
+ li 27, 0
+ xxlor 31, 34, 34
+ lvx 2, 0, 26
+ xxlor 30, 34, 34
+.LBB3_2:
+ mr 26, 27
+ addi 27, 27, 1
+ xxlor 28, 40, 40
+ cmpld 27, 4
+ sldi 26, 26, 6
+ xxlor 24, 45, 45
+ iseleq 24, 10, 9
+ add 23, 6, 26
+ add 22, 30, 26
+ lxvd2x 0, 6, 26
+ lxvd2x 1, 7, 26
+ or 25, 24, 25
+ add 24, 7, 26
+ lxvd2x 2, 30, 26
+ lxvd2x 3, 29, 26
+ xxlor 29, 38, 38
+ lxvd2x 4, 23, 11
+ lxvd2x 6, 24, 11
+ clrlwi 25, 25, 24
+ lxvd2x 7, 22, 11
+ lxvd2x 8, 23, 0
+ mtfprd 5, 25
+ add 25, 29, 26
+ xxswapd 34, 0
+ lxvd2x 0, 25, 11
+ xxswapd 36, 1
+ xxswapd 33, 2
+ lxvd2x 1, 24, 0
+ lxvd2x 2, 22, 0
+ xxswapd 39, 3
+ xxswapd 32, 4
+ lxvd2x 3, 25, 0
+ lxvd2x 4, 23, 28
+ xxswapd 49, 6
+ xxswapd 51, 7
+ lxvd2x 6, 24, 28
+ xxswapd 58, 8
+ lxvd2x 7, 22, 28
+ lxvd2x 8, 25, 28
+ xxswapd 60, 0
+ mr 25, 3
+ xxswapd 57, 1
+ xxswapd 53, 2
+ xxswapd 52, 3
+ xxswapd 56, 4
+ xxswapd 55, 6
+ xxswapd 0, 5
+ xxswapd 40, 7
+ xxswapd 41, 8
+ mtctr 12
+.LBB3_3:
+ ldu 24, 8(25)
+ add 24, 24, 26
+ addi 24, 24, 256
+ dcbt 0, 24
+ bdnz .LBB3_3
+ vmrgew 3, 4, 2
+ vspltisw 31, 9
+ mr 25, 8
+ vmrglw 10, 4, 2
+ vspltisw 14, 10
+ vmrghw 6, 4, 2
+ xxspltw 0, 0, 3
+ vmrgew 4, 17, 0
+ vmrglw 11, 17, 0
+ vmrghw 16, 17, 0
+ vmrgew 0, 25, 26
+ vmrgew 13, 7, 1
+ vmrglw 2, 7, 1
+ vmrghw 7, 7, 1
+ xxlor 25, 36, 36
+ vmrgew 4, 28, 19
+ xxlor 26, 32, 32
+ vmrglw 0, 25, 26
+ vmrglw 1, 28, 19
+ xxmrgld 47, 34, 42
+ xxlor 44, 28, 28
+ vmrghw 25, 25, 26
+ xxlor 23, 36, 36
+ vmrghw 4, 28, 19
+ vspltisw 19, -16
+ xxlor 5, 32, 32
+ vmrgew 0, 20, 21
+ xxmrgld 34, 33, 43
+ vmrglw 28, 20, 21
+ vmrghw 21, 20, 21
+ vmrglw 20, 23, 24
+ vmrghw 26, 23, 24
+ vmrglw 17, 9, 8
+ xxlor 8, 32, 32
+ vmrgew 0, 23, 24
+ xxmrgld 56, 39, 38
+ vmrgew 23, 9, 8
+ xxlor 33, 24, 24
+ xxlor 2, 34, 34
+ vadduwm 11, 15, 1
+ xxmrgld 33, 36, 48
+ xxlor 6, 47, 47
+ xxlor 27, 32, 32
+ vmrghw 0, 9, 8
+ vspltisw 9, 12
+ vsubuwm 8, 31, 19
+ xxmrgld 51, 23, 25
+ vadduwm 31, 2, 12
+ xxlor 34, 10, 10
+ vadduwm 10, 14, 14
+ vslw 15, 2, 2
+ xxlor 34, 29, 29
+ vadduwm 14, 24, 27
+ xxlor 24, 48, 48
+ vadduwm 16, 1, 2
+ xxmrgld 34, 45, 35
+ vadduwm 31, 31, 30
+ xxmrghd 36, 36, 24
+ vadduwm 11, 11, 29
+ vadduwm 14, 14, 18
+ vadduwm 13, 16, 22
+ xxlxor 47, 63, 47
+ xxlor 1, 9, 9
+ xxlor 1, 11, 11
+ xxlxor 48, 43, 9
+ vadduwm 11, 11, 2
+ xxlor 7, 34, 34
+ xxmrghd 34, 39, 38
+ xxlxor 39, 46, 11
+ xxlor 1, 50, 50
+ xxlxor 50, 45, 0
+ vperm 15, 15, 15, 5
+ vperm 16, 16, 16, 5
+ vperm 7, 7, 7, 5
+ vperm 18, 18, 18, 5
+ xxlor 4, 33, 33
+ xxlor 33, 31, 31
+ vadduwm 14, 14, 2
+ xxlor 3, 34, 34
+ xxlor 34, 12, 12
+ xxlor 35, 13, 13
+ vadduwm 6, 15, 1
+ xxlor 33, 30, 30
+ vadduwm 2, 16, 2
+ vadduwm 3, 7, 3
+ vadduwm 12, 18, 1
+ xxlxor 59, 34, 61
+ xxlxor 61, 35, 1
+ xxlxor 33, 38, 62
+ xxlxor 62, 44, 54
+ vrlw 22, 27, 10
+ vrlw 29, 29, 10
+ vrlw 1, 1, 10
+ vrlw 30, 30, 10
+ vadduwm 31, 31, 19
+ vadduwm 13, 13, 4
+ vadduwm 11, 22, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 1, 31
+ vadduwm 13, 30, 13
+ vadduwm 9, 9, 9
+ xxlor 1, 36, 36
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 39
+ xxmrgld 39, 60, 5
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 28, 4, 9
+ xxmrgld 36, 53, 57
+ vrlw 15, 15, 9
+ xxmrghd 57, 53, 57
+ vrlw 18, 18, 9
+ vadduwm 14, 14, 4
+ xxlor 0, 36, 36
+ xxmrgld 36, 49, 52
+ vadduwm 2, 16, 2
+ xxmrgld 49, 8, 26
+ vadduwm 3, 28, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 54, 34, 54
+ xxlxor 61, 35, 61
+ xxlxor 33, 38, 33
+ xxlxor 62, 44, 62
+ vrlw 29, 29, 8
+ vrlw 20, 1, 8
+ xxmrgld 33, 55, 27
+ vrlw 30, 30, 8
+ vrlw 22, 22, 8
+ vadduwm 11, 11, 7
+ xxlor 5, 39, 39
+ xxmrgld 39, 32, 58
+ vadduwm 31, 31, 4
+ vadduwm 11, 29, 11
+ vadduwm 13, 13, 7
+ vadduwm 14, 20, 14
+ vadduwm 31, 30, 31
+ vadduwm 13, 22, 13
+ xxlor 28, 36, 36
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 60
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vadduwm 11, 11, 17
+ vmr 28, 17
+ xxmrghd 49, 32, 58
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 21, 4, 2
+ vadduwm 3, 15, 3
+ xxlxor 34, 38, 61
+ xxlxor 61, 44, 52
+ xxlxor 62, 53, 62
+ xxlxor 54, 35, 54
+ vrlw 20, 2, 10
+ vrlw 29, 29, 10
+ vrlw 0, 30, 10
+ vrlw 30, 22, 10
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 17
+ vadduwm 11, 20, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 24
+ xxlor 8, 56, 56
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 21
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 52
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 25, 51, 51
+ vmr 26, 17
+ xxlor 49, 3, 3
+ xxlor 52, 1, 1
+ xxlor 51, 2, 2
+ vadduwm 14, 14, 17
+ vadduwm 31, 31, 20
+ vadduwm 13, 13, 19
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 29, 39, 39
+ xxlor 59, 4, 4
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 53, 0, 0
+ xxlor 39, 6, 6
+ vadduwm 11, 11, 27
+ vadduwm 14, 14, 21
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 1
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 34, 7, 7
+ vadduwm 31, 31, 28
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 11, 11, 2
+ xxlor 34, 28, 28
+ vadduwm 13, 13, 26
+ vadduwm 14, 14, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 2, 58, 58
+ xxlor 39, 25, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 54, 29, 29
+ xxlor 58, 5, 5
+ vadduwm 11, 11, 25
+ vadduwm 14, 14, 7
+ vadduwm 31, 31, 22
+ vadduwm 13, 13, 26
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 21
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 20
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 0, 33, 33
+ xxlor 33, 8, 8
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vadduwm 11, 11, 19
+ vadduwm 14, 14, 2
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 22
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ vadduwm 11, 11, 27
+ vadduwm 14, 14, 28
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 31, 31, 25
+ vadduwm 13, 13, 26
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 3, 7, 7
+ vadduwm 11, 11, 7
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 33, 6, 6
+ xxlor 58, 2, 2
+ xxlor 39, 3, 3
+ vadduwm 14, 14, 1
+ vadduwm 31, 31, 26
+ vadduwm 13, 13, 7
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ xxlor 52, 0, 0
+ vadduwm 11, 11, 21
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 14, 14, 2
+ vadduwm 31, 31, 22
+ vadduwm 13, 13, 20
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 7, 49, 49
+ vmr 17, 2
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 54, 1, 1
+ xxlor 34, 7, 7
+ vadduwm 11, 11, 22
+ vadduwm 14, 14, 28
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 26
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 59, 25, 25
+ vadduwm 11, 11, 19
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 27
+ vadduwm 13, 13, 7
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vmr 2, 19
+ xxlor 0, 7, 7
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ xxlor 1, 51, 51
+ xxlor 7, 39, 39
+ xxlor 51, 8, 8
+ xxlor 39, 5, 5
+ xxlor 34, 4, 4
+ vadduwm 11, 11, 1
+ vadduwm 14, 14, 19
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ xxlor 2, 53, 53
+ vmr 21, 28
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 53, 29, 29
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 28
+ vadduwm 31, 31, 26
+ vadduwm 13, 13, 21
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ vadduwm 11, 11, 20
+ xxlor 5, 52, 52
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 52, 2, 2
+ vadduwm 14, 14, 25
+ vadduwm 31, 31, 20
+ vadduwm 13, 13, 7
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ vadduwm 11, 11, 22
+ vadduwm 14, 14, 27
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 31, 31, 1
+ vadduwm 13, 13, 2
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 3, 29, 29
+ xxlor 4, 49, 49
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ vmr 17, 28
+ xxlor 2, 54, 54
+ xxlor 3, 34, 34
+ xxlor 34, 8, 8
+ xxlor 51, 0, 0
+ xxlor 60, 7, 7
+ xxlor 54, 1, 1
+ vadduwm 11, 11, 2
+ vadduwm 14, 14, 19
+ vadduwm 31, 31, 28
+ vadduwm 13, 13, 22
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 56, 32
+ vrlw 30, 30, 8
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 26
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 6, 39, 39
+ xxlor 39, 4, 4
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 30, 30, 10
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vadduwm 11, 11, 21
+ vadduwm 14, 14, 27
+ vadduwm 31, 31, 7
+ vadduwm 13, 13, 28
+ vadduwm 11, 30, 11
+ vadduwm 14, 23, 14
+ vadduwm 31, 29, 31
+ vadduwm 13, 0, 13
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 63, 47
+ xxlxor 50, 45, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 0, 49, 49
+ xxlor 49, 5, 5
+ vadduwm 24, 16, 24
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 62, 56, 62
+ xxlxor 55, 35, 55
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 23, 23, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ vrlw 30, 30, 8
+ vadduwm 11, 11, 17
+ vadduwm 14, 14, 1
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 22
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 34, 3, 3
+ xxlor 49, 2, 2
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 24, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 55, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 32, 56, 32
+ xxlxor 62, 35, 62
+ vrlw 23, 23, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ vrlw 30, 30, 10
+ vadduwm 11, 11, 19
+ vadduwm 14, 14, 20
+ vadduwm 31, 31, 2
+ vadduwm 13, 13, 17
+ vadduwm 11, 23, 11
+ vadduwm 14, 29, 14
+ vadduwm 31, 0, 31
+ vadduwm 13, 30, 13
+ xxlxor 50, 43, 50
+ xxlxor 48, 46, 48
+ xxlxor 36, 63, 36
+ xxlxor 47, 45, 47
+ vrlw 18, 18, 9
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vadduwm 14, 14, 27
+ vadduwm 11, 11, 25
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 27, 4, 24
+ vadduwm 3, 15, 3
+ xxlxor 57, 38, 55
+ xxlxor 61, 44, 61
+ xxlxor 62, 35, 62
+ xxlxor 32, 59, 32
+ xxlor 39, 7, 7
+ vrlw 30, 30, 8
+ vrlw 25, 25, 8
+ vrlw 29, 29, 8
+ vrlw 0, 0, 8
+ xxlor 1, 58, 58
+ vmr 26, 19
+ vadduwm 19, 31, 7
+ xxlor 39, 6, 6
+ vadduwm 11, 30, 11
+ vadduwm 7, 13, 7
+ vadduwm 13, 25, 14
+ vadduwm 14, 29, 19
+ vadduwm 7, 0, 7
+ xxlxor 48, 43, 48
+ xxlxor 36, 45, 36
+ xxlxor 47, 46, 47
+ xxlxor 50, 39, 50
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ vperm 18, 18, 18, 5
+ xxlor 51, 1, 1
+ vadduwm 13, 13, 1
+ vadduwm 11, 11, 19
+ vadduwm 19, 16, 27
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 63, 51, 62
+ xxlxor 62, 35, 57
+ xxlxor 61, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 31, 31, 10
+ vrlw 30, 30, 10
+ vrlw 29, 29, 10
+ vrlw 0, 0, 10
+ xxlor 33, 0, 0
+ vadduwm 7, 7, 2
+ vadduwm 14, 14, 1
+ vadduwm 11, 31, 11
+ vadduwm 13, 30, 13
+ vadduwm 14, 29, 14
+ vadduwm 7, 0, 7
+ xxlxor 48, 43, 48
+ xxlxor 36, 45, 36
+ xxlxor 47, 46, 47
+ xxlxor 50, 39, 50
+ vrlw 16, 16, 9
+ vrlw 4, 4, 9
+ vrlw 15, 15, 9
+ vrlw 18, 18, 9
+ xxlor 60, 8, 8
+ vadduwm 1, 11, 21
+ vadduwm 11, 13, 28
+ vadduwm 13, 16, 19
+ vadduwm 3, 4, 3
+ vadduwm 6, 15, 6
+ vadduwm 12, 18, 12
+ xxlxor 51, 45, 63
+ xxlxor 63, 35, 62
+ xxlxor 62, 38, 61
+ xxlxor 32, 44, 32
+ vrlw 31, 31, 8
+ vrlw 30, 30, 8
+ vrlw 0, 0, 8
+ vrlw 19, 19, 8
+ vadduwm 14, 14, 26
+ vadduwm 7, 7, 17
+ vadduwm 1, 31, 1
+ vadduwm 11, 30, 11
+ vadduwm 14, 0, 14
+ vadduwm 7, 19, 7
+ xxlxor 50, 33, 50
+ xxlxor 48, 43, 48
+ xxlxor 36, 46, 36
+ xxlxor 47, 39, 47
+ vperm 18, 18, 18, 5
+ vperm 16, 16, 16, 5
+ vperm 4, 4, 4, 5
+ vperm 15, 15, 15, 5
+ xxlor 34, 4, 4
+ vadduwm 14, 14, 22
+ vadduwm 6, 18, 6
+ vadduwm 12, 16, 12
+ vadduwm 13, 4, 13
+ vadduwm 3, 15, 3
+ xxlxor 49, 38, 63
+ xxlxor 63, 44, 62
+ xxlxor 32, 45, 32
+ xxlxor 51, 35, 51
+ vrlw 17, 17, 10
+ vrlw 31, 31, 10
+ vrlw 0, 0, 10
+ vrlw 10, 19, 10
+ vadduwm 11, 11, 2
+ xxlor 34, 5, 5
+ vadduwm 1, 1, 20
+ vadduwm 2, 7, 2
+ vadduwm 7, 31, 11
+ vadduwm 11, 0, 14
+ vadduwm 2, 10, 2
+ vadduwm 1, 17, 1
+ xxlxor 36, 43, 36
+ xxlxor 46, 34, 47
+ vrlw 4, 4, 9
+ vrlw 14, 14, 9
+ xxlxor 47, 33, 50
+ xxlxor 48, 39, 48
+ vrlw 15, 15, 9
+ vrlw 9, 16, 9
+ vadduwm 13, 4, 13
+ vadduwm 3, 14, 3
+ xxlxor 32, 45, 32
+ xxlxor 45, 45, 33
+ xxlxor 33, 35, 42
+ xxlxor 59, 35, 39
+ vadduwm 3, 15, 6
+ vadduwm 6, 9, 12
+ xxlxor 39, 35, 49
+ xxlxor 42, 38, 63
+ vrlw 1, 1, 8
+ vrlw 7, 7, 8
+ vrlw 10, 10, 8
+ vrlw 0, 0, 8
+ xxlxor 40, 35, 43
+ xxlxor 38, 38, 34
+ xxlxor 61, 33, 41
+ xxlxor 50, 39, 36
+ xxlxor 62, 42, 46
+ xxlxor 54, 32, 47
+ bne 0, .LBB3_2
+.LBB3_5:
+ vmrglw 2, 27, 13
+ li 3, 32
+ li 4, 48
+ vmrglw 4, 6, 8
+ vmrglw 0, 18, 29
+ vmrglw 1, 22, 30
+ vmrghw 3, 27, 13
+ vmrghw 5, 6, 8
+ vmrghw 6, 18, 29
+ vmrghw 7, 22, 30
+ xxmrgld 40, 36, 34
+ xxmrghd 34, 36, 34
+ xxmrgld 41, 33, 32
+ xxswapd 0, 40
+ xxmrgld 36, 37, 35
+ xxmrghd 35, 37, 35
+ xxmrghd 37, 33, 32
+ xxswapd 1, 41
+ xxmrgld 32, 39, 38
+ xxmrghd 33, 39, 38
+ xxswapd 2, 34
+ xxswapd 4, 36
+ xxswapd 3, 37
+ stxvd2x 0, 0, 5
+ xxswapd 5, 32
+ stxvd2x 1, 5, 11
+ xxswapd 0, 35
+ xxswapd 1, 33
+ stxvd2x 2, 5, 3
+ li 3, 64
+ stxvd2x 3, 5, 4
+ li 4, 80
+ stxvd2x 4, 5, 3
+ li 3, 96
+ stxvd2x 5, 5, 4
+ li 4, 112
+ stxvd2x 0, 5, 3
+ stxvd2x 1, 5, 4
+ li 3, 224
+ lxvd2x 63, 1, 3
+ li 3, 208
+ lfd 31, 392(1)
+ ld 30, 312(1)
+ ld 29, 304(1)
+ lxvd2x 62, 1, 3
+ li 3, 192
+ lfd 30, 384(1)
+ ld 28, 296(1)
+ ld 27, 288(1)
+ lxvd2x 61, 1, 3
+ li 3, 176
+ lfd 29, 376(1)
+ ld 26, 280(1)
+ ld 25, 272(1)
+ lxvd2x 60, 1, 3
+ li 3, 160
+ lfd 28, 368(1)
+ ld 24, 264(1)
+ ld 23, 256(1)
+ lxvd2x 59, 1, 3
+ li 3, 144
+ lfd 27, 360(1)
+ ld 22, 248(1)
+ lxvd2x 58, 1, 3
+ li 3, 128
+ lfd 26, 352(1)
+ lxvd2x 57, 1, 3
+ li 3, 112
+ lfd 25, 344(1)
+ lxvd2x 56, 1, 3
+ li 3, 96
+ lfd 24, 336(1)
+ lxvd2x 55, 1, 3
+ li 3, 80
+ lfd 23, 328(1)
+ lxvd2x 54, 1, 3
+ li 3, 64
+ lxvd2x 53, 1, 3
+ li 3, 48
+ lxvd2x 52, 1, 3
+ addi 1, 1, 400
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end3:
+ .size blake3_hash4_sse2, .Lfunc_end3-.Lfunc_begin3
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
new file mode 100644
index 000000000..a8b2627f1
--- /dev/null
+++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
@@ -0,0 +1,3064 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ *
+ * This is converted assembly: SSE4.1 -> POWER8 PPC64 Little Endian
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__))
+ .text
+ .abiversion 2 /* ELFv2 ABI object */
+ .section .rodata.cst16,"aM",@progbits,16 /* 16-byte mergeable constants read by zfs_blake3_compress_in_place_sse41 */
+ .p2align 4 /* 16-byte alignment; every entry below is fetched with lvx */
+.LCPI0_0: /* vperm control: loaded into v6 and used once by "vperm 2, 1, 2, 6" */
+ .byte 31
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 30
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 29
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 28
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_1: /* loaded into v4; only compared against zero (vcmpgtsb) to build the mask kept in v2 */
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI0_2: /* vperm control kept in v3: swaps the two halfwords of each word (the "vperm x, y, y, 3" lines) */
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI0_3: /* loaded into v10 and added once via "vadduwm 8, 12, 10"; values match the first four BLAKE3 IV words */
+ .long 1779033703 /* 0x6A09E667 */
+ .long 3144134277 /* 0xBB67AE85 */
+ .long 1013904242 /* 0x3C6EF372 */
+ .long 2773480762 /* 0xA54FF53A */
+.LCPI0_4: /* vperm control: loaded into v11 and used by "vperm 11, 0, 5, 11" */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_5: /* loaded into v5; only compared against zero (vcmpgtsb) to build the mask kept in v5 */
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI0_6: /* vperm control kept in v0 for the "vperm x, y, y, 0" byte shuffles */
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+.LCPI0_7: /* vperm control: loaded into v14 and used by "vperm 14, 10, 12, 14" */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_8: /* vperm control kept in v6 (the "vperm ..., 6" word selections) */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_9: /* vperm control kept in v8 (the "vperm ..., 8" word selections) */
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_10: /* vperm control kept in v7 (the "vperm ..., 7" word selections) */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+.LCPI0_11: /* vperm control kept in v9 (the "vperm ..., 9" word selections) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI0_12: /* vperm control kept in v10 (the "vperm ..., 10" word selections) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI0_13: /* vperm control kept in v11 (the "vperm ..., 11" word selections after the first one) */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI0_14: /* vperm control: loaded into v16 late in the function and used by "vperm 16, 14, 30, 16" */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .text
+ .globl zfs_blake3_compress_in_place_sse41 /* exported; straight-line leaf routine (no calls, no branches before blr) */
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41: /* reads 32 bytes at r3 and 64 bytes at r4, takes scalars in r5/r6/r7, writes 32 bytes back at r3; presumably (cv, block, block_len, counter, flags) - TODO confirm against the C prototype */
+.Lfunc_begin0:
+ .cfi_startproc
+.Lfunc_gep0: /* global entry point: r12 holds this address */
+ addis 2, 12, .TOC.-.Lfunc_gep0@ha /* r2 = TOC pointer derived from r12 (high part) */
+ addi 2, 2, .TOC.-.Lfunc_gep0@l
+.Lfunc_lep0: /* local entry point: callers that already have r2 set enter here */
+ .localentry zfs_blake3_compress_in_place_sse41, .Lfunc_lep0-.Lfunc_gep0
+ li 8, -64 /* r8 walks the offsets -64..-16 from r1 used to save vs60-vs63; r1 itself is never adjusted */
+ mtvsrd 34, 5 /* scalar argument r5 -> vs34 (v2) */
+ li 5, 16 /* r5 is reused as the constant offset 16 from here on */
+ lfdx 0, 0, 4 /* f0 = 8 bytes at r4+0 */
+ vspltisw 13, -16 /* v13 = -16 in every word; feeds the rotate count built in v1 below */
+ stxvd2x 60, 1, 8 /* save callee-saved vs60 (v28) at r1-64 */
+ li 8, -48
+ mtvsrd 35, 7 /* scalar argument r7 -> vs35 (v3) */
+ lfd 2, 16(4) /* f2 = 8 bytes at r4+16 */
+ lfd 3, 24(4) /* f3 = 8 bytes at r4+24 */
+ addis 7, 2, .LCPI0_0@toc@ha
+ stxvd2x 61, 1, 8 /* save vs61 (v29) at r1-48 */
+ li 8, -32
+ mtvsrwz 36, 6 /* low 32 bits of r6 -> vs36 (v4) */
+ rldicl 6, 6, 32, 32 /* r6 = upper 32 bits of the original r6 */
+ stxvd2x 62, 1, 8 /* save vs62 (v30) at r1-32 */
+ li 8, -16
+ vmrghb 2, 3, 2 /* interleave bytes of the r7 and r5 values */
+ stxvd2x 63, 1, 8 /* save vs63 (v31) at r1-16 */
+ mtvsrwz 35, 6 /* upper 32 bits of the original r6 -> vs35 (v3) */
+ addi 6, 7, .LCPI0_0@toc@l
+ addis 7, 2, .LCPI0_2@toc@ha
+ lfd 1, 8(4) /* f1 = 8 bytes at r4+8 */
+ xxmrghd 32, 3, 2
+ lvx 6, 0, 6 /* v6 = .LCPI0_0 */
+ xxlxor 33, 33, 33 /* v1 = 0 */
+ addis 6, 2, .LCPI0_1@toc@ha
+ addi 7, 7, .LCPI0_2@toc@l
+ vmrghw 3, 3, 4 /* combine the two 32-bit halves of the original r6 into one vector */
+ addi 6, 6, .LCPI0_1@toc@l
+ vspltisw 14, 9 /* v14 = 9 in every word; feeds the rotate count built in v1 below */
+ xxmrghd 37, 1, 0
+ lxvd2x 0, 0, 3 /* vs0 = 16 bytes at r3+0 */
+ lxvd2x 1, 3, 5 /* vs1 = 16 bytes at r3+16 */
+ vperm 2, 1, 2, 6
+ vpkudum 9, 0, 5
+ xxswapd 36, 0 /* lxvd2x loads doublewords swapped on LE; swap back into v4 */
+ xxswapd 38, 1 /* same for the second 16 bytes, into v6 */
+ xxmrgld 34, 34, 35
+ lvx 3, 0, 7 /* v3 = .LCPI0_2 (halfword-swap control, used until near the end) */
+ addis 7, 2, .LCPI0_4@toc@ha
+ addi 7, 7, .LCPI0_4@toc@l
+ vadduwm 4, 9, 4
+ lvx 11, 0, 7 /* v11 = .LCPI0_4 */
+ addis 7, 2, .LCPI0_6@toc@ha
+ addi 7, 7, .LCPI0_6@toc@l
+ vadduwm 7, 4, 6
+ lvx 4, 0, 6 /* v4 = .LCPI0_1 */
+ addis 6, 2, .LCPI0_3@toc@ha
+ addi 6, 6, .LCPI0_3@toc@l
+ vperm 11, 0, 5, 11
+ lvx 0, 0, 7 /* v0 = .LCPI0_6 (byte-shuffle control, used until the end) */
+ li 7, 48
+ xxlxor 40, 39, 34
+ lvx 10, 0, 6 /* v10 = .LCPI0_3 (four 32-bit constants) */
+ addis 6, 2, .LCPI0_5@toc@ha
+ lxvd2x 1, 4, 7 /* vs1 = 16 bytes at r4+48 */
+ vcmpgtsb 2, 1, 4 /* v2 = (0 > byte of .LCPI0_1) per byte; those bytes are all 0..15, so this mask comes out all-zero */
+ addi 6, 6, .LCPI0_5@toc@l
+ vperm 4, 8, 8, 3
+ vspltisw 8, 10
+ xxlandc 44, 36, 34 /* clear bytes selected by the v2 mask */
+ vadduwm 4, 8, 8 /* v4 = 10 + 10 = 20 in every word: count for the "vrlw ..., 4" lines (rotate left 20 == right 12) */
+ vadduwm 8, 12, 10 /* add the .LCPI0_3 constants */
+ xxlxor 37, 40, 38
+ vrlw 6, 5, 4
+ vadduwm 5, 7, 11
+ vadduwm 7, 6, 5
+ lvx 5, 0, 6 /* v5 = .LCPI0_5 */
+ li 6, 32
+ lxvd2x 0, 4, 6 /* vs0 = 16 bytes at r4+32 */
+ addis 4, 2, .LCPI0_7@toc@ha /* r4 is reused for constant-pool addresses from here on */
+ addis 6, 2, .LCPI0_9@toc@ha
+ xxlxor 42, 39, 44
+ xxswapd 44, 1
+ addi 4, 4, .LCPI0_7@toc@l
+ addi 6, 6, .LCPI0_9@toc@l
+ vcmpgtsb 5, 1, 5 /* v5 = (0 > byte of .LCPI0_5) per byte; those bytes are all 0..15, so this mask comes out all-zero */
+ vperm 1, 10, 10, 0
+ xxswapd 42, 0
+ vpkudum 16, 12, 10
+ xxlandc 47, 33, 37 /* clear bytes selected by the v5 mask */
+ vsubuwm 1, 14, 13 /* v1 = 9 - (-16) = 25 in every word: count for the "vrlw ..., 1" lines (rotate left 25 == right 7) */
+ lvx 14, 0, 4 /* v14 = .LCPI0_7 */
+ addis 4, 2, .LCPI0_8@toc@ha
+ vadduwm 8, 15, 8
+ xxswapd 45, 47
+ addi 4, 4, .LCPI0_8@toc@l
+ vadduwm 7, 7, 16
+ xxsldwi 48, 48, 48, 1
+ xxlxor 38, 40, 38
+ xxsldwi 40, 40, 40, 3
+ xxsldwi 39, 39, 39, 1
+ vperm 14, 10, 12, 14
+ vrlw 6, 6, 1
+ vadduwm 7, 6, 7
+ xxlxor 45, 39, 45
+ vperm 13, 13, 13, 3
+ xxlandc 45, 45, 34
+ vadduwm 8, 13, 8
+ xxlxor 38, 40, 38
+ vrlw 10, 6, 4
+ vadduwm 6, 7, 14
+ vadduwm 7, 10, 6
+ xxlxor 38, 39, 45
+ vperm 12, 6, 6, 0
+ lvx 6, 0, 4 /* v6 = .LCPI0_8 */
+ addis 4, 2, .LCPI0_10@toc@ha
+ addi 4, 4, .LCPI0_10@toc@l
+ vperm 13, 11, 9, 6
+ xxlandc 44, 44, 37
+ vadduwm 15, 12, 8
+ vadduwm 7, 7, 13
+ xxsldwi 45, 45, 45, 3
+ xxlxor 40, 47, 42
+ xxsldwi 47, 47, 47, 1
+ xxsldwi 39, 39, 39, 3
+ vrlw 10, 8, 1
+ xxswapd 40, 44
+ vadduwm 17, 10, 7
+ lvx 7, 0, 4 /* v7 = .LCPI0_10 */
+ addis 4, 2, .LCPI0_11@toc@ha
+ addi 4, 4, .LCPI0_11@toc@l
+ xxlxor 44, 49, 40
+ lvx 8, 0, 6 /* v8 = .LCPI0_9 */
+ vperm 18, 9, 9, 7
+ lvx 9, 0, 4 /* v9 = .LCPI0_11 */
+ addis 4, 2, .LCPI0_12@toc@ha
+ vperm 12, 12, 12, 3
+ addi 4, 4, .LCPI0_12@toc@l
+ vperm 19, 14, 16, 8
+ xxlandc 63, 44, 34
+ vperm 12, 19, 18, 9
+ vadduwm 15, 31, 15
+ xxlxor 42, 47, 42
+ vrlw 18, 10, 4
+ vadduwm 10, 17, 12
+ vadduwm 17, 18, 10
+ xxlxor 42, 49, 63
+ xxmrgld 63, 43, 46
+ xxsldwi 49, 49, 49, 1
+ vmrghw 14, 14, 11
+ vperm 19, 10, 10, 0
+ lvx 10, 0, 4 /* v10 = .LCPI0_12 */
+ addis 4, 2, .LCPI0_13@toc@ha
+ addi 4, 4, .LCPI0_13@toc@l
+ lvx 11, 0, 4 /* v11 = .LCPI0_13 */
+ addis 4, 2, .LCPI0_14@toc@ha
+ vperm 31, 16, 31, 10
+ addi 4, 4, .LCPI0_14@toc@l /* r4 = address of .LCPI0_14, consumed much later by "lvx 16, 0, 4" */
+ vperm 14, 14, 16, 11
+ xxlandc 51, 51, 37
+ vadduwm 15, 19, 15
+ xxswapd 51, 51
+ vadduwm 17, 17, 31
+ xxlxor 50, 47, 50
+ xxsldwi 47, 47, 47, 3
+ vperm 30, 14, 31, 8
+ vrlw 18, 18, 1
+ vadduwm 17, 18, 17
+ xxlxor 51, 49, 51
+ vadduwm 17, 17, 14
+ vperm 19, 19, 19, 3
+ xxlandc 51, 51, 34
+ vadduwm 15, 19, 15
+ xxlxor 48, 47, 50
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 51
+ vperm 19, 12, 13, 6
+ vperm 18, 18, 18, 0
+ vperm 13, 13, 13, 7
+ vadduwm 17, 17, 19
+ xxlandc 50, 50, 37
+ xxsldwi 49, 49, 49, 3
+ vperm 13, 30, 13, 9
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxmrgld 62, 44, 46
+ vmrghw 12, 14, 12
+ xxlxor 48, 47, 48
+ xxsldwi 47, 47, 47, 1
+ vrlw 16, 16, 1
+ vperm 30, 31, 30, 10
+ vperm 12, 12, 31, 11
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 13
+ vperm 18, 18, 18, 3
+ vperm 31, 12, 30, 8
+ xxlandc 50, 50, 34
+ vadduwm 15, 18, 15
+ xxlxor 48, 47, 48
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ xxsldwi 49, 49, 49, 1
+ vperm 18, 18, 18, 0
+ vadduwm 17, 17, 30
+ xxlandc 50, 50, 37
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxlxor 48, 47, 48
+ xxsldwi 46, 47, 47, 3
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 12
+ vperm 18, 18, 18, 3
+ xxlandc 47, 50, 34
+ xxsldwi 50, 51, 51, 3
+ vadduwm 14, 15, 14
+ vperm 19, 13, 18, 6
+ xxlxor 48, 46, 48
+ vperm 18, 18, 18, 7
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vadduwm 17, 17, 19
+ vperm 15, 15, 15, 0
+ xxsldwi 49, 49, 49, 3
+ xxlandc 47, 47, 37
+ vadduwm 14, 15, 14
+ xxswapd 47, 47
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 29, 15, 14
+ vperm 14, 31, 18, 9
+ xxmrgld 50, 45, 44
+ xxlxor 48, 61, 48
+ vmrghw 12, 12, 13
+ vrlw 16, 16, 4
+ vperm 18, 30, 18, 10
+ vadduwm 17, 17, 14
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ xxsldwi 49, 49, 49, 1
+ vperm 15, 15, 15, 0
+ vadduwm 17, 17, 18
+ xxlandc 47, 47, 37
+ vadduwm 31, 15, 29
+ xxswapd 47, 47
+ xxlxor 48, 63, 48
+ xxsldwi 45, 63, 63, 3
+ vperm 31, 12, 30, 11
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 13, 15, 13
+ xxlxor 44, 45, 48
+ vadduwm 16, 17, 31
+ xxsldwi 49, 51, 51, 3
+ vrlw 12, 12, 4
+ vperm 19, 14, 17, 6
+ vadduwm 16, 12, 16
+ xxlxor 47, 48, 47
+ vperm 15, 15, 15, 0
+ xxlandc 47, 47, 37
+ vadduwm 13, 15, 13
+ xxswapd 47, 47
+ xxlxor 44, 45, 44
+ xxsldwi 45, 45, 45, 1
+ vrlw 30, 12, 1
+ vadduwm 12, 16, 19
+ xxsldwi 44, 44, 44, 3
+ vadduwm 16, 30, 12
+ xxlxor 44, 48, 47
+ vperm 15, 17, 17, 7
+ vperm 12, 12, 12, 3
+ vperm 17, 31, 18, 8
+ xxlandc 61, 44, 34
+ vperm 12, 17, 15, 9
+ vadduwm 13, 29, 13
+ xxlxor 47, 45, 62
+ xxmrgld 62, 46, 63
+ vmrghw 14, 31, 14
+ vrlw 15, 15, 4
+ vadduwm 16, 16, 12
+ vperm 30, 18, 30, 10
+ vperm 14, 14, 18, 11
+ xxsldwi 50, 51, 51, 3
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 61
+ xxsldwi 48, 48, 48, 1
+ vperm 19, 12, 18, 6
+ vperm 17, 17, 17, 0
+ vadduwm 16, 16, 30
+ xxmrgld 60, 44, 46
+ vmrghw 12, 14, 12
+ vperm 28, 30, 28, 10
+ xxlandc 49, 49, 37
+ vadduwm 13, 17, 13
+ xxswapd 49, 49
+ vperm 12, 12, 30, 11
+ xxlxor 47, 45, 47
+ xxsldwi 45, 45, 45, 3
+ vrlw 15, 15, 1
+ vperm 8, 12, 28, 8 /* last use of the .LCPI0_9 control; v8 is overwritten here */
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vadduwm 16, 16, 14
+ vperm 17, 17, 17, 3
+ xxlandc 49, 49, 34
+ vadduwm 13, 17, 13
+ xxlxor 47, 45, 47
+ vrlw 15, 15, 4
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vperm 17, 17, 17, 0
+ xxlandc 49, 49, 37
+ vadduwm 31, 17, 13
+ xxlxor 45, 63, 47
+ vrlw 15, 13, 1
+ vadduwm 13, 16, 19
+ xxswapd 48, 49
+ xxsldwi 51, 51, 51, 3
+ xxsldwi 45, 45, 45, 3
+ vadduwm 17, 15, 13
+ xxlxor 45, 49, 48
+ lvx 16, 0, 4 /* v16 = .LCPI0_14 (r4 was pointed at it earlier) */
+ vperm 29, 13, 13, 3
+ vperm 13, 18, 18, 7
+ xxsldwi 50, 63, 63, 1
+ vperm 16, 14, 30, 16
+ vperm 7, 19, 19, 7 /* last use of the .LCPI0_10 control; v7 is overwritten here */
+ xxlandc 63, 61, 34
+ vadduwm 18, 31, 18
+ vperm 29, 16, 13, 9
+ xxlxor 47, 50, 47
+ vperm 6, 16, 19, 6 /* last use of the .LCPI0_8 control; v6 is overwritten here */
+ vrlw 15, 15, 4
+ vperm 7, 8, 7, 9
+ vadduwm 17, 17, 29
+ xxmrgld 41, 61, 44
+ vadduwm 17, 15, 17
+ vperm 9, 28, 9, 10
+ xxlxor 63, 49, 63
+ xxsldwi 49, 49, 49, 1
+ vperm 31, 31, 31, 0
+ vadduwm 17, 17, 28
+ xxlandc 63, 63, 37
+ vadduwm 18, 31, 18
+ xxswapd 63, 63
+ xxlxor 47, 50, 47
+ xxsldwi 46, 50, 50, 3
+ vrlw 15, 15, 1
+ vadduwm 17, 15, 17
+ xxlxor 63, 49, 63
+ vadduwm 17, 17, 12
+ vperm 31, 31, 31, 3
+ xxlandc 50, 63, 34
+ vadduwm 14, 18, 14
+ xxlxor 47, 46, 47
+ vrlw 15, 15, 4
+ vadduwm 17, 15, 17
+ xxlxor 50, 49, 50
+ vadduwm 6, 17, 6
+ vperm 18, 18, 18, 0
+ xxsldwi 38, 38, 38, 3
+ xxlandc 50, 50, 37
+ vadduwm 14, 18, 14
+ xxswapd 48, 50
+ xxlxor 47, 46, 47
+ xxsldwi 46, 46, 46, 1
+ vrlw 15, 15, 1
+ vadduwm 6, 15, 6
+ xxlxor 48, 38, 48
+ vadduwm 6, 6, 7
+ vperm 16, 16, 16, 3
+ xxlandc 48, 48, 34
+ vadduwm 14, 16, 14
+ xxlxor 40, 46, 47
+ vrlw 8, 8, 4
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 48
+ xxsldwi 38, 38, 38, 1
+ vperm 7, 7, 7, 0
+ vadduwm 6, 6, 9
+ xxlandc 39, 39, 37
+ vadduwm 14, 7, 14
+ xxswapd 39, 39
+ xxlxor 40, 46, 40
+ xxsldwi 41, 46, 46, 3
+ vrlw 8, 8, 1
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 39
+ vperm 3, 7, 7, 3 /* last use of the .LCPI0_2 control; v3 is overwritten here */
+ vmrghw 7, 12, 13
+ xxlandc 34, 35, 34
+ vperm 7, 7, 28, 11
+ vadduwm 3, 2, 9
+ xxlxor 40, 35, 40
+ vrlw 4, 8, 4 /* last use of the count 20; v4 is overwritten here */
+ vadduwm 6, 6, 7
+ vadduwm 6, 4, 6
+ xxlxor 34, 38, 34
+ xxsldwi 0, 38, 38, 3
+ vperm 2, 2, 2, 0
+ xxlandc 34, 34, 37
+ vadduwm 3, 2, 3
+ xxswapd 34, 34
+ xxlxor 36, 35, 36
+ xxsldwi 1, 35, 35, 1
+ vrlw 4, 4, 1
+ xxlxor 0, 1, 0 /* first output half = rotated v6 xor rotated v3 */
+ xxswapd 0, 0 /* pre-swap doublewords for stxvd2x on LE */
+ xxlxor 1, 36, 34 /* second output half = v4 xor v2 */
+ stxvd2x 0, 0, 3 /* store 16 bytes to r3+0 */
+ xxswapd 1, 1
+ stxvd2x 1, 3, 5 /* store 16 bytes to r3+16 (r5 still holds 16) */
+ li 3, -16 /* r3 is reused as the restore offset from here on */
+ lxvd2x 63, 1, 3 /* restore vs63 (v31) from r1-16 */
+ li 3, -32
+ lxvd2x 62, 1, 3 /* restore vs62 (v30) from r1-32 */
+ li 3, -48
+ lxvd2x 61, 1, 3 /* restore vs61 (v29) from r1-48 */
+ li 3, -64
+ lxvd2x 60, 1, 3 /* restore vs60 (v28) from r1-64 */
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end0:
+ .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-.Lfunc_begin0
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16 /* mergeable 16-byte constants read by zfs_blake3_compress_xof_sse41 through TOC-relative lvx loads */
+ .p2align 4
+.LCPI1_0: /* vperm control; loaded into v6 and used once by "vperm 2, 1, 2, 6" with an all-zero first source */
+ .byte 31
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 30
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 29
+ .byte 6
+ .byte 5
+ .byte 4
+ .byte 28
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_1: /* only compared against zero with vcmpgtsb to build a byte mask (v2); every entry is non-negative, so that mask is all zeros */
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI1_2: /* vperm control (kept in v3); with both sources equal it swaps the 16-bit halves of every 32-bit word, i.e. rotates each word by 16 */
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI1_3: /* four 32-bit constants loaded into v10 and added into the state; NOTE(review): values match the first four BLAKE3 IV words -- confirm against the spec */
+ .long 1779033703 /* 0x6A09E667 */
+ .long 3144134277 /* 0xBB67AE85 */
+ .long 1013904242 /* 0x3C6EF372 */
+ .long 2773480762 /* 0xA54FF53A */
+.LCPI1_4: /* vperm control; loaded into v11 and used by "vperm 11, 0, 5, 11" on the first 32 bytes read from r4 */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_5: /* only compared against zero with vcmpgtsb to build a byte mask (v5); every entry is non-negative, so that mask is all zeros */
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI1_6: /* vperm control (kept in v0); with both sources equal it rotates every 32-bit word right by 8 bits */
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+.LCPI1_7: /* vperm control; loaded into v14 and used by "vperm 14, 10, 12, 14" on the last 32 bytes read from r4 */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_8: /* vperm control; held in v6 for the rest of the function (word-granular selection from two source vectors) */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_9: /* vperm control; held in v8 (word-granular selection from two source vectors) */
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_10: /* vperm control; held in v7 and always applied with both sources equal */
+ .byte 19
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 31
+ .byte 31
+ .byte 31
+.LCPI1_11: /* vperm control; held in v9 (word-granular selection from two source vectors) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+.LCPI1_12: /* vperm control; held in v10 (word-granular selection from two source vectors) */
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+.LCPI1_13: /* vperm control; held in v11 (word-granular selection from two source vectors) */
+ .byte 27
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 11
+ .byte 10
+ .byte 9
+ .byte 8
+ .byte 15
+ .byte 14
+ .byte 13
+ .byte 12
+ .byte 31
+ .byte 30
+ .byte 29
+ .byte 28
+.LCPI1_14: /* vperm control; its address stays in r4 and it is loaded late into v16 for "vperm 16, 14, 30, 16" */
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .byte 3
+ .byte 2
+ .byte 1
+ .byte 0
+ .text
+ .globl zfs_blake3_compress_xof_sse41 /* Reads 32 bytes at r3 and 64 bytes at r4, mixes in the scalars r5/r6/r7, writes 64 bytes at r8. NOTE(review): presumably (cv, block, block_len, counter, flags, out) -- confirm against the C prototype */
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41: /* leaf function: no calls, r1 is never moved */
+.Lfunc_begin1:
+ .cfi_startproc
+.Lfunc_gep1:
+ addis 2, 12, .TOC.-.Lfunc_gep1@ha /* global entry: derive the TOC pointer r2 from the entry address in r12 */
+ addi 2, 2, .TOC.-.Lfunc_gep1@l
+.Lfunc_lep1:
+ .localentry zfs_blake3_compress_xof_sse41, .Lfunc_lep1-.Lfunc_gep1 /* local entry starts after the r2 setup */
+ li 9, -64 /* vs60..vs63 (v28..v31) are spilled at r1-64..r1-16, below the stack pointer */
+ mtvsrd 34, 5 /* scalar argument r5 -> v2 */
+ li 5, 16 /* r5 is reused as the constant byte offset 16 */
+ lfdx 0, 0, 4 /* f0 = bytes 0..7 of the buffer at r4 */
+ vspltisw 13, -16 /* v13 = -16 per word; combined with 9 below to form the rotate count 25 */
+ addis 11, 2, .LCPI1_9@toc@ha /* r11 keeps the high part of the .LCPI1_9 address for later */
+ stxvd2x 60, 1, 9 /* save vs60 (v28) */
+ li 9, -48
+ mtvsrd 35, 7 /* scalar argument r7 -> v3 */
+ lfd 1, 8(4) /* f1 = bytes 8..15 of the buffer at r4 */
+ lfd 2, 16(4) /* f2 = bytes 16..23 of the buffer at r4 */
+ addis 7, 2, .LCPI1_0@toc@ha /* r7 is free now; reused for constant addresses */
+ stxvd2x 61, 1, 9 /* save vs61 (v29) */
+ li 9, -32
+ mtvsrwz 36, 6 /* low 32 bits of argument r6 -> v4 */
+ rldicl 6, 6, 32, 32 /* r6 = high 32 bits of argument r6 */
+ stxvd2x 62, 1, 9 /* save vs62 (v30) */
+ li 9, -16
+ vmrghb 2, 3, 2 /* interleave bytes of the r7 and r5 values */
+ stxvd2x 63, 1, 9 /* save vs63 (v31) */
+ mtvsrwz 35, 6 /* high 32 bits of argument r6 -> v3 */
+ addi 6, 7, .LCPI1_0@toc@l
+ addis 7, 2, .LCPI1_2@toc@ha
+ lfd 3, 24(4) /* f3 = bytes 24..31 of the buffer at r4 */
+ xxmrghd 37, 1, 0 /* v5 = the 16 bytes gathered from r4+0 and r4+8 */
+ lvx 6, 0, 6 /* v6 = .LCPI1_0 */
+ xxlxor 33, 33, 33 /* v1 = 0 */
+ lxvd2x 0, 0, 3 /* vs0 = bytes 0..15 at r3 */
+ addis 6, 2, .LCPI1_1@toc@ha
+ addi 7, 7, .LCPI1_2@toc@l
+ vmrghw 3, 3, 4 /* interleave the high and low 32-bit halves of argument r6 */
+ lxvd2x 1, 3, 5 /* vs1 = bytes 16..31 at r3 (r5 = 16) */
+ addi 6, 6, .LCPI1_1@toc@l
+ vspltisw 14, 9 /* v14 = 9 per word (see rotate count 25 below) */
+ xxmrghd 32, 3, 2 /* v0 = the 16 bytes gathered from r4+16 and r4+24 */
+ xxswapd 36, 0 /* v4 = first 16 bytes from r3, doublewords swapped after lxvd2x */
+ vperm 2, 1, 2, 6 /* spread the merged r5/r7 bytes into a vector, zero-filling from v1 */
+ xxswapd 38, 1 /* v6 = second 16 bytes from r3, doublewords swapped after lxvd2x */
+ vpkudum 9, 0, 5 /* v9 = low 32 bits of each doubleword of v0 and v5 */
+ xxmrgld 34, 34, 35 /* v2 = low doublewords of v2 (r5/r7 bytes) and v3 (r6 halves) */
+ lvx 3, 0, 7 /* v3 = .LCPI1_2 (rotate-by-16 permute control) */
+ addis 7, 2, .LCPI1_4@toc@ha
+ addi 7, 7, .LCPI1_4@toc@l
+ vadduwm 4, 9, 4
+ lvx 11, 0, 7 /* v11 = .LCPI1_4 */
+ addis 7, 2, .LCPI1_6@toc@ha
+ addi 7, 7, .LCPI1_6@toc@l
+ vadduwm 7, 4, 6
+ lvx 4, 0, 6 /* v4 = .LCPI1_1, used only for the sign mask below */
+ addis 6, 2, .LCPI1_3@toc@ha
+ addi 6, 6, .LCPI1_3@toc@l
+ vperm 11, 0, 5, 11
+ lvx 0, 0, 7 /* v0 = .LCPI1_6 (rotate-right-by-8 permute control) */
+ li 7, 32 /* r7 = byte offset 32 (also used for the output stores) */
+ xxlxor 40, 39, 34
+ lvx 10, 0, 6 /* v10 = .LCPI1_3 (four 32-bit constants) */
+ addis 6, 2, .LCPI1_5@toc@ha
+ lxvd2x 0, 4, 7 /* vs0 = bytes 32..47 of the buffer at r4 */
+ vcmpgtsb 2, 1, 4 /* v2 = bytes where 0 > .LCPI1_1 entry; used as an and-not mask after each rotate-by-16 */
+ addi 6, 6, .LCPI1_5@toc@l
+ vperm 4, 8, 8, 3 /* rotate each word of v8 by 16 */
+ vspltisw 8, 10
+ xxlandc 44, 36, 34 /* v12 = v4 & ~v2 */
+ vadduwm 4, 8, 8 /* v4 = 20 per word: vrlw by 20 is a rotate right by 12 */
+ vadduwm 8, 12, 10
+ xxlxor 37, 40, 38
+ vrlw 6, 5, 4 /* rotate right by 12 */
+ vadduwm 5, 7, 11
+ vadduwm 7, 6, 5
+ lvx 5, 0, 6 /* v5 = .LCPI1_5, used only for the sign mask below */
+ li 6, 48 /* r6 = byte offset 48 (also used for the output stores) */
+ lxvd2x 1, 4, 6 /* vs1 = bytes 48..63 of the buffer at r4 */
+ addis 4, 2, .LCPI1_7@toc@ha /* r4 is no longer the input pointer; reused for constant addresses */
+ xxlxor 42, 39, 44
+ addi 4, 4, .LCPI1_7@toc@l
+ vcmpgtsb 5, 1, 5 /* v5 = bytes where 0 > .LCPI1_5 entry; used as an and-not mask after each rotate-by-8 */
+ vperm 1, 10, 10, 0 /* rotate each word of v10 right by 8 */
+ xxswapd 42, 0 /* v10 = bytes 32..47 from r4, doublewords swapped after lxvd2x */
+ xxswapd 44, 1 /* v12 = bytes 48..63 from r4, doublewords swapped after lxvd2x */
+ vpkudum 16, 12, 10 /* v16 = low 32 bits of each doubleword of v12 and v10 */
+ xxlandc 47, 33, 37 /* v15 = v1 & ~v5 */
+ vsubuwm 1, 14, 13 /* v1 = 9 - (-16) = 25 per word: vrlw by 25 is a rotate right by 7 */
+ lvx 14, 0, 4 /* v14 = .LCPI1_7 */
+ addis 4, 2, .LCPI1_8@toc@ha
+ vadduwm 8, 15, 8
+ xxswapd 45, 47
+ addi 4, 4, .LCPI1_8@toc@l
+ xxlxor 38, 40, 38
+ xxsldwi 40, 40, 40, 3 /* xxsldwi with both sources equal rotates the vector by whole 32-bit words */
+ vadduwm 7, 7, 16
+ xxsldwi 48, 48, 48, 1
+ vrlw 6, 6, 1 /* rotate right by 7 */
+ xxsldwi 39, 39, 39, 1
+ vperm 14, 10, 12, 14
+ vadduwm 7, 6, 7
+ xxlxor 45, 39, 45
+ vperm 13, 13, 13, 3 /* rotate by 16 */
+ xxlandc 45, 45, 34
+ vadduwm 8, 13, 8
+ xxlxor 38, 40, 38
+ vrlw 10, 6, 4 /* rotate right by 12 */
+ vadduwm 6, 7, 14
+ vadduwm 7, 10, 6
+ xxlxor 38, 39, 45
+ vperm 12, 6, 6, 0 /* rotate right by 8 */
+ lvx 6, 0, 4 /* v6 = .LCPI1_8 */
+ addis 4, 2, .LCPI1_10@toc@ha
+ addi 4, 4, .LCPI1_10@toc@l
+ vperm 13, 11, 9, 6
+ xxlandc 44, 44, 37
+ vadduwm 15, 12, 8
+ vadduwm 7, 7, 13
+ xxsldwi 45, 45, 45, 3
+ xxlxor 40, 47, 42
+ xxsldwi 47, 47, 47, 1
+ xxsldwi 39, 39, 39, 3
+ vrlw 10, 8, 1 /* rotate right by 7 */
+ xxswapd 40, 44
+ vadduwm 17, 10, 7
+ lvx 7, 0, 4 /* v7 = .LCPI1_10 */
+ addi 4, 11, .LCPI1_9@toc@l /* completes the .LCPI1_9 address started in r11 at function entry */
+ xxlxor 44, 49, 40
+ lvx 8, 0, 4 /* v8 = .LCPI1_9 */
+ addis 4, 2, .LCPI1_11@toc@ha
+ vperm 18, 9, 9, 7
+ addi 4, 4, .LCPI1_11@toc@l
+ vperm 12, 12, 12, 3
+ lvx 9, 0, 4 /* v9 = .LCPI1_11 */
+ addis 4, 2, .LCPI1_12@toc@ha
+ vperm 19, 14, 16, 8
+ addi 4, 4, .LCPI1_12@toc@l
+ xxlandc 63, 44, 34
+ vperm 12, 19, 18, 9
+ vadduwm 15, 31, 15
+ xxlxor 42, 47, 42
+ vrlw 18, 10, 4
+ vadduwm 10, 17, 12
+ vadduwm 17, 18, 10
+ xxlxor 42, 49, 63
+ xxmrgld 63, 43, 46
+ xxsldwi 49, 49, 49, 1
+ vmrghw 14, 14, 11
+ vperm 19, 10, 10, 0
+ lvx 10, 0, 4 /* v10 = .LCPI1_12 */
+ addis 4, 2, .LCPI1_13@toc@ha
+ addi 4, 4, .LCPI1_13@toc@l
+ lvx 11, 0, 4 /* v11 = .LCPI1_13 */
+ addis 4, 2, .LCPI1_14@toc@ha
+ vperm 31, 16, 31, 10
+ addi 4, 4, .LCPI1_14@toc@l /* r4 = address of .LCPI1_14; it is only loaded near the end of the function */
+ vperm 14, 14, 16, 11
+ xxlandc 51, 51, 37
+ vadduwm 15, 19, 15
+ xxswapd 51, 51
+ vadduwm 17, 17, 31
+ xxlxor 50, 47, 50
+ xxsldwi 47, 47, 47, 3
+ vperm 30, 14, 31, 8
+ vrlw 18, 18, 1
+ vadduwm 17, 18, 17
+ xxlxor 51, 49, 51
+ vadduwm 17, 17, 14
+ vperm 19, 19, 19, 3
+ xxlandc 51, 51, 34
+ vadduwm 15, 19, 15
+ xxlxor 48, 47, 50
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 51
+ vperm 19, 12, 13, 6
+ vperm 18, 18, 18, 0
+ vperm 13, 13, 13, 7
+ vadduwm 17, 17, 19
+ xxlandc 50, 50, 37
+ xxsldwi 49, 49, 49, 3
+ vperm 13, 30, 13, 9
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxmrgld 62, 44, 46
+ vmrghw 12, 14, 12
+ xxlxor 48, 47, 48
+ xxsldwi 47, 47, 47, 1
+ vrlw 16, 16, 1
+ vperm 30, 31, 30, 10
+ vperm 12, 12, 31, 11
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 13
+ vperm 18, 18, 18, 3
+ vperm 31, 12, 30, 8
+ xxlandc 50, 50, 34
+ vadduwm 15, 18, 15
+ xxlxor 48, 47, 48
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ xxsldwi 49, 49, 49, 1
+ vperm 18, 18, 18, 0
+ vadduwm 17, 17, 30
+ xxlandc 50, 50, 37
+ vadduwm 15, 18, 15
+ xxswapd 50, 50
+ xxlxor 48, 47, 48
+ xxsldwi 46, 47, 47, 3
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 50, 49, 50
+ vadduwm 17, 17, 12
+ vperm 18, 18, 18, 3
+ xxlandc 47, 50, 34
+ xxsldwi 50, 51, 51, 3
+ vadduwm 14, 15, 14
+ vperm 19, 13, 18, 6
+ xxlxor 48, 46, 48
+ vperm 18, 18, 18, 7
+ vrlw 16, 16, 4
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vadduwm 17, 17, 19
+ vperm 15, 15, 15, 0
+ xxsldwi 49, 49, 49, 3
+ xxlandc 47, 47, 37
+ vadduwm 14, 15, 14
+ xxswapd 47, 47
+ xxlxor 48, 46, 48
+ xxsldwi 46, 46, 46, 1
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 29, 15, 14
+ vperm 14, 31, 18, 9
+ xxmrgld 50, 45, 44
+ xxlxor 48, 61, 48
+ vmrghw 12, 12, 13
+ vrlw 16, 16, 4
+ vperm 18, 30, 18, 10
+ vadduwm 17, 17, 14
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ xxsldwi 49, 49, 49, 1
+ vperm 15, 15, 15, 0
+ vadduwm 17, 17, 18
+ xxlandc 47, 47, 37
+ vadduwm 31, 15, 29
+ xxswapd 47, 47
+ xxlxor 48, 63, 48
+ xxsldwi 45, 63, 63, 3
+ vperm 31, 12, 30, 11
+ vrlw 16, 16, 1
+ vadduwm 17, 16, 17
+ xxlxor 47, 49, 47
+ vperm 15, 15, 15, 3
+ xxlandc 47, 47, 34
+ vadduwm 13, 15, 13
+ xxlxor 44, 45, 48
+ vadduwm 16, 17, 31
+ xxsldwi 49, 51, 51, 3
+ vrlw 12, 12, 4
+ vperm 19, 14, 17, 6
+ vadduwm 16, 12, 16
+ xxlxor 47, 48, 47
+ vperm 15, 15, 15, 0
+ xxlandc 47, 47, 37
+ vadduwm 13, 15, 13
+ xxswapd 47, 47
+ xxlxor 44, 45, 44
+ xxsldwi 45, 45, 45, 1
+ vrlw 30, 12, 1
+ vadduwm 12, 16, 19
+ xxsldwi 44, 44, 44, 3
+ vadduwm 16, 30, 12
+ xxlxor 44, 48, 47
+ vperm 15, 17, 17, 7
+ vperm 12, 12, 12, 3
+ vperm 17, 31, 18, 8
+ xxlandc 61, 44, 34
+ vperm 12, 17, 15, 9
+ vadduwm 13, 29, 13
+ xxlxor 47, 45, 62
+ xxmrgld 62, 46, 63
+ vmrghw 14, 31, 14
+ vrlw 15, 15, 4
+ vadduwm 16, 16, 12
+ vperm 30, 18, 30, 10
+ vperm 14, 14, 18, 11
+ xxsldwi 50, 51, 51, 3
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 61
+ xxsldwi 48, 48, 48, 1
+ vperm 19, 12, 18, 6
+ vperm 17, 17, 17, 0
+ vadduwm 16, 16, 30
+ xxmrgld 60, 44, 46
+ vmrghw 12, 14, 12
+ vperm 28, 30, 28, 10
+ xxlandc 49, 49, 37
+ vadduwm 13, 17, 13
+ xxswapd 49, 49
+ vperm 12, 12, 30, 11
+ xxlxor 47, 45, 47
+ xxsldwi 45, 45, 45, 3
+ vrlw 15, 15, 1
+ vperm 8, 12, 28, 8 /* last use of v8 as the .LCPI1_9 control; v8 is overwritten here */
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vadduwm 16, 16, 14
+ vperm 17, 17, 17, 3
+ xxlandc 49, 49, 34
+ vadduwm 13, 17, 13
+ xxlxor 47, 45, 47
+ vrlw 15, 15, 4
+ vadduwm 16, 15, 16
+ xxlxor 49, 48, 49
+ vperm 17, 17, 17, 0
+ xxlandc 49, 49, 37
+ vadduwm 31, 17, 13
+ xxlxor 45, 63, 47
+ vrlw 15, 13, 1
+ vadduwm 13, 16, 19
+ xxswapd 48, 49
+ xxsldwi 51, 51, 51, 3
+ xxsldwi 45, 45, 45, 3
+ vadduwm 17, 15, 13
+ xxlxor 45, 49, 48
+ lvx 16, 0, 4 /* v16 = .LCPI1_14 (address prepared in r4 earlier) */
+ vperm 29, 13, 13, 3
+ vperm 13, 18, 18, 7
+ xxsldwi 50, 63, 63, 1
+ vperm 16, 14, 30, 16
+ vperm 7, 19, 19, 7 /* last use of v7 as the .LCPI1_10 control; v7 is overwritten here */
+ xxlandc 63, 61, 34
+ vadduwm 18, 31, 18
+ vperm 29, 16, 13, 9
+ xxlxor 47, 50, 47
+ vperm 6, 16, 19, 6 /* last use of v6 as the .LCPI1_8 control; v6 is overwritten here */
+ vrlw 15, 15, 4
+ vperm 7, 8, 7, 9
+ vadduwm 17, 17, 29
+ xxmrgld 41, 61, 44
+ vadduwm 17, 15, 17
+ vperm 9, 28, 9, 10
+ xxlxor 63, 49, 63
+ xxsldwi 49, 49, 49, 1
+ vperm 31, 31, 31, 0
+ vadduwm 17, 17, 28
+ xxlandc 63, 63, 37
+ vadduwm 18, 31, 18
+ xxswapd 63, 63
+ xxlxor 47, 50, 47
+ xxsldwi 46, 50, 50, 3
+ vrlw 15, 15, 1
+ vadduwm 17, 15, 17
+ xxlxor 63, 49, 63
+ vadduwm 17, 17, 12
+ vperm 31, 31, 31, 3
+ xxlandc 50, 63, 34
+ vadduwm 14, 18, 14
+ xxlxor 47, 46, 47
+ vrlw 15, 15, 4
+ vadduwm 17, 15, 17
+ xxlxor 50, 49, 50
+ vadduwm 6, 17, 6
+ vperm 18, 18, 18, 0
+ xxsldwi 38, 38, 38, 3
+ xxlandc 50, 50, 37
+ vadduwm 14, 18, 14
+ xxswapd 48, 50
+ xxlxor 47, 46, 47
+ xxsldwi 46, 46, 46, 1
+ vrlw 15, 15, 1
+ vadduwm 6, 15, 6
+ xxlxor 48, 38, 48
+ vadduwm 6, 6, 7
+ vperm 16, 16, 16, 3
+ xxlandc 48, 48, 34
+ vadduwm 14, 16, 14
+ xxlxor 40, 46, 47
+ vrlw 8, 8, 4
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 48
+ xxsldwi 38, 38, 38, 1
+ vperm 7, 7, 7, 0
+ vadduwm 6, 6, 9
+ xxlandc 39, 39, 37
+ vadduwm 14, 7, 14
+ xxswapd 39, 39
+ xxlxor 40, 46, 40
+ xxsldwi 41, 46, 46, 3
+ vrlw 8, 8, 1
+ vadduwm 6, 8, 6
+ xxlxor 39, 38, 39
+ vperm 3, 7, 7, 3 /* final rotate-by-16; v3 (the control itself) is overwritten with the result */
+ vmrghw 7, 12, 13
+ xxlandc 34, 35, 34
+ vperm 7, 7, 28, 11
+ vadduwm 3, 2, 9
+ xxlxor 40, 35, 40
+ vrlw 4, 8, 4 /* final rotate right by 12; v4 (the count itself) is overwritten with the result */
+ vadduwm 6, 6, 7
+ vadduwm 6, 4, 6
+ xxlxor 34, 38, 34
+ xxsldwi 0, 38, 38, 3
+ vperm 2, 2, 2, 0
+ xxlandc 34, 34, 37
+ vadduwm 3, 2, 3
+ xxswapd 34, 34
+ xxlxor 36, 35, 36
+ xxsldwi 1, 35, 35, 1
+ vrlw 4, 4, 1
+ xxlxor 0, 1, 0 /* vs0 = word-rotated v3 ^ word-rotated v6 */
+ xxswapd 0, 0 /* swap doublewords back into the order stxvd2x expects */
+ xxlxor 2, 36, 34 /* vs2 = v4 ^ v2 */
+ stxvd2x 0, 0, 8 /* store output bytes 0..15 at r8 */
+ xxswapd 2, 2
+ stxvd2x 2, 8, 5 /* store output bytes 16..31 (r5 = 16) */
+ lfdx 0, 0, 3 /* reload bytes 0..7 of the input at r3 */
+ lfd 2, 8(3) /* reload bytes 8..15 of the input at r3 */
+ xxmrghd 35, 2, 0
+ xxlxor 0, 1, 35 /* vs0 = vs1 ^ first 16 bytes of the r3 input */
+ xxswapd 0, 0
+ stxvd2x 0, 8, 7 /* store output bytes 32..47 (r7 = 32) */
+ lfd 0, 16(3) /* reload bytes 16..23 of the input at r3 */
+ lfd 1, 24(3) /* reload bytes 24..31 of the input at r3 */
+ li 3, -16 /* r3 is reused as the offset for restoring vs63..vs60 */
+ xxmrghd 35, 1, 0
+ xxlxor 0, 34, 35 /* vs0 = v2 ^ second 16 bytes of the r3 input */
+ xxswapd 0, 0
+ stxvd2x 0, 8, 6 /* store output bytes 48..63 (r6 = 48) */
+ lxvd2x 63, 1, 3 /* restore vs63 (v31) */
+ li 3, -32
+ lxvd2x 62, 1, 3 /* restore vs62 (v30) */
+ li 3, -48
+ lxvd2x 61, 1, 3 /* restore vs61 (v29) */
+ li 3, -64
+ lxvd2x 60, 1, 3 /* restore vs60 (v28) */
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end1:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-.Lfunc_begin1
+ .cfi_endproc
+
+ .globl zfs_blake3_hash_many_sse41 /* Driver: hands groups of four inputs to blake3_hash4_sse41, then processes the rest one at a time via zfs_blake3_compress_in_place_sse41. NOTE(review): presumably (inputs, num_inputs, blocks, key, counter, increment_counter, flags, flags_start, flags_end, out) -- confirm against the C prototype */
+ .p2align 2
+ .type zfs_blake3_hash_many_sse41,@function
+zfs_blake3_hash_many_sse41: /* register arguments r3..r10, two more arguments read from the caller frame */
+.Lfunc_begin2:
+ .cfi_startproc
+.Lfunc_gep2:
+ addis 2, 12, .TOC.-.Lfunc_gep2@ha /* global entry: derive the TOC pointer r2 from the entry address in r12 */
+ addi 2, 2, .TOC.-.Lfunc_gep2@l
+.Lfunc_lep2:
+ .localentry zfs_blake3_hash_many_sse41, .Lfunc_lep2-.Lfunc_gep2
+ mfocrf 12, 32 /* r12 = CR field 2; CR bit 8 is used as a long-lived flag below */
+ mflr 0
+ std 0, 16(1) /* save LR at 16(r1) of the caller frame */
+ stw 12, 8(1) /* save the CR word at 8(r1) of the caller frame */
+ stdu 1, -256(1) /* allocate a 256-byte frame */
+ .cfi_def_cfa_offset 256
+ .cfi_offset lr, 16
+ .cfi_offset r17, -120
+ .cfi_offset r18, -112
+ .cfi_offset r19, -104
+ .cfi_offset r20, -96
+ .cfi_offset r21, -88
+ .cfi_offset r22, -80
+ .cfi_offset r23, -72
+ .cfi_offset r24, -64
+ .cfi_offset r25, -56
+ .cfi_offset r26, -48
+ .cfi_offset r27, -40
+ .cfi_offset r28, -32
+ .cfi_offset r29, -24
+ .cfi_offset r30, -16
+ .cfi_offset cr2, 8
+ std 26, 208(1)
+ mr 26, 4 /* r26 = number of inputs still to process (argument r4) */
+ cmpldi 1, 4, 4 /* cr1 = compare input count with 4 */
+ andi. 4, 8, 1 /* cr0.gt is set when bit 0 of argument r8 is 1 */
+ std 18, 144(1)
+ std 19, 152(1)
+ crmove 8, 1 /* CR bit 8 (cr2.lt) = cr0.gt; this copy survives the calls below */
+ ld 19, 360(1) /* r19 = doubleword at 360(r1) in the caller frame: output pointer, advanced after every hash */
+ lwz 18, 352(1) /* r18 = word at 352(r1) in the caller frame; OR-ed into the flags of each input's last block */
+ std 24, 192(1)
+ std 25, 200(1)
+ std 27, 216(1)
+ std 28, 224(1)
+ mr 24, 10 /* r24 = argument r10 */
+ mr 28, 6 /* r28 = argument r6 (32 bytes are read from it per single input) */
+ mr 27, 5 /* r27 = argument r5 (number of 64-byte blocks per input) */
+ mr 25, 3 /* r25 = argument r3 (array of 8-byte input pointers) */
+ std 29, 232(1)
+ std 30, 240(1)
+ mr 30, 9 /* r30 = argument r9 */
+ mr 29, 7 /* r29 = argument r7 (running counter value) */
+ std 17, 136(1)
+ std 20, 160(1)
+ std 21, 168(1)
+ std 22, 176(1)
+ std 23, 184(1)
+ blt 1, .LBB2_3 /* fewer than 4 inputs: skip the 4-way loop */
+ li 3, 0
+ li 4, 1
+ clrldi 23, 30, 32 /* r23 = argument r9 zero-extended from 32 bits */
+ isel 22, 4, 3, 8 /* r22 = 1 if CR bit 8 is set, else 0 */
+ clrldi 21, 24, 32 /* r21 = argument r10 zero-extended from 32 bits */
+ clrldi 20, 18, 32 /* r20 = stack argument r18 zero-extended from 32 bits */
+.LBB2_2: /* 4-way loop: one blake3_hash4_sse41 call per group of four inputs */
+ mr 3, 25
+ mr 4, 27
+ mr 5, 28
+ mr 6, 29
+ mr 7, 22
+ mr 8, 23
+ mr 9, 21
+ mr 10, 20
+ std 19, 32(1) /* the output pointer is passed in memory at 32(r1); the callee reads it from there */
+ bl blake3_hash4_sse41
+ addi 26, 26, -4 /* four inputs consumed */
+ addi 3, 29, 4
+ addi 25, 25, 32 /* advance past four 8-byte input pointers */
+ addi 19, 19, 128 /* advance the output pointer by 4 * 32 bytes */
+ cmpldi 26, 3
+ isel 29, 3, 29, 8 /* counter += 4 only when CR bit 8 is set */
+ bgt 0, .LBB2_2 /* repeat while more than 3 inputs remain */
+.LBB2_3:
+ cmpldi 26, 0
+ beq 0, .LBB2_11 /* nothing left: go to the epilogue */
+ li 3, 0
+ li 4, 1
+ or 21, 24, 30 /* r21 = argument r10 | argument r9: flags for the first block of each input */
+ li 20, 16 /* r20 = byte offset 16 for the 32-byte copies */
+ addi 24, 1, 96 /* r24 = 32-byte scratch buffer at r1+96 (r24 no longer holds argument r10) */
+ isel 22, 4, 3, 8 /* r22 = per-input counter increment: 1 if CR bit 8 is set, else 0 */
+.LBB2_5: /* outer loop: one remaining input per iteration */
+ lxvd2x 0, 28, 20 /* copy 32 bytes from r28 into the scratch buffer (upper half first) */
+ ld 23, 0(25) /* r23 = current input pointer */
+ mr 17, 27 /* r17 = blocks left for this input */
+ mr 3, 21 /* r3 = flags for the first block */
+ stxvd2x 0, 24, 20
+ lxvd2x 0, 0, 28
+ stxvd2x 0, 0, 24
+.LBB2_6: /* inner loop: one 64-byte block per iteration */
+ cmpldi 17, 1
+ beq 0, .LBB2_8 /* last block: add the end flags first */
+ cmpldi 17, 0
+ bne 0, .LBB2_9
+ b .LBB2_10 /* no blocks left: write out the result */
+.LBB2_8:
+ or 3, 3, 18 /* OR in the stack-passed flags word for the final block */
+.LBB2_9:
+ clrldi 7, 3, 56 /* r7 = flags truncated to 8 bits */
+ mr 3, 24 /* r3 = scratch buffer, updated in place by the callee */
+ mr 4, 23 /* r4 = current block */
+ li 5, 64 /* r5 = 64 */
+ mr 6, 29 /* r6 = counter */
+ bl zfs_blake3_compress_in_place_sse41
+ addi 23, 23, 64 /* next 64-byte block */
+ addi 17, 17, -1
+ mr 3, 30 /* blocks after the first use argument r9 alone as flags */
+ b .LBB2_6
+.LBB2_10:
+ lxvd2x 0, 24, 20 /* copy the 32-byte scratch buffer to the output at r19 */
+ addi 26, 26, -1 /* one input consumed */
+ add 29, 29, 22 /* counter += r22 (0 or 1) */
+ addi 25, 25, 8 /* next input pointer */
+ cmpldi 26, 0
+ stxvd2x 0, 19, 20
+ lxvd2x 0, 0, 24
+ stxvd2x 0, 0, 19
+ addi 19, 19, 32 /* advance the output pointer by 32 bytes */
+ bne 0, .LBB2_5
+.LBB2_11: /* epilogue: restore r17..r30, pop the frame, restore CR field 2 and LR */
+ ld 30, 240(1)
+ ld 29, 232(1)
+ ld 28, 224(1)
+ ld 27, 216(1)
+ ld 26, 208(1)
+ ld 25, 200(1)
+ ld 24, 192(1)
+ ld 23, 184(1)
+ ld 22, 176(1)
+ ld 21, 168(1)
+ ld 20, 160(1)
+ ld 19, 152(1)
+ ld 18, 144(1)
+ ld 17, 136(1)
+ addi 1, 1, 256
+ ld 0, 16(1)
+ lwz 12, 8(1)
+ mtocrf 32, 12
+ mtlr 0
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end2-.Lfunc_begin2
+ .cfi_endproc
+
+ .section .rodata.cst16,"aM",@progbits,16 /* mergeable 16-byte constants read by blake3_hash4_sse41 through TOC-relative loads */
+ .p2align 4
+.LCPI3_0: /* the 32-bit words 0, 1, 2, 3 written as two quads; ANDed with the splatted negated increment argument in the function prologue */
+ .quad 4294967296 /* (1 << 32) | 0 */
+ .quad 12884901890 /* (3 << 32) | 2 */
+.LCPI3_1: /* only compared against zero with vcmpgtsb to build a byte mask; every entry is non-negative, so that mask is all zeros */
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI3_2: /* only compared against zero with vcmpgtsb to build a byte mask; every entry is non-negative, so that mask is all zeros */
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI3_3: /* vperm control (parked in vs11); with both sources equal it swaps the 16-bit halves of every 32-bit word, i.e. rotates each word by 16 */
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 30
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 26
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 22
+ .byte 17
+ .byte 16
+ .byte 19
+ .byte 18
+.LCPI3_4: /* 0x6A09E667 in all four lanes (parked in vs12); NOTE(review): matches BLAKE3 IV word 0 -- confirm against the spec */
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+ .long 1779033703
+.LCPI3_5: /* 0xBB67AE85 in all four lanes (parked in vs30); NOTE(review): matches BLAKE3 IV word 1 -- confirm against the spec */
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+ .long 3144134277
+.LCPI3_6: /* 0x3C6EF372 in all four lanes (parked in vs29); NOTE(review): matches BLAKE3 IV word 2 -- confirm against the spec */
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+ .long 1013904242
+.LCPI3_7: /* 0xA54FF53A in all four lanes (parked in vs28); NOTE(review): matches BLAKE3 IV word 3 -- confirm against the spec */
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+ .long 2773480762
+.LCPI3_8: /* vperm control (parked in vs27); with both sources equal it rotates every 32-bit word right by 8 bits */
+ .byte 30
+ .byte 29
+ .byte 28
+ .byte 31
+ .byte 26
+ .byte 25
+ .byte 24
+ .byte 27
+ .byte 22
+ .byte 21
+ .byte 20
+ .byte 23
+ .byte 18
+ .byte 17
+ .byte 16
+ .byte 19
+ .text
+ .p2align 2
+ .type blake3_hash4_sse41,@function
+blake3_hash4_sse41:
+.Lfunc_begin3:
+ .cfi_startproc
+.Lfunc_gep3:
+ addis 2, 12, .TOC.-.Lfunc_gep3@ha
+ addi 2, 2, .TOC.-.Lfunc_gep3@l
+.Lfunc_lep3:
+ .localentry blake3_hash4_sse41, .Lfunc_lep3-.Lfunc_gep3
+ stdu 1, -416(1)
+ .cfi_def_cfa_offset 416
+ .cfi_offset r22, -176
+ .cfi_offset r23, -168
+ .cfi_offset r24, -160
+ .cfi_offset r25, -152
+ .cfi_offset r26, -144
+ .cfi_offset r27, -136
+ .cfi_offset r28, -128
+ .cfi_offset r29, -120
+ .cfi_offset r30, -112
+ .cfi_offset f20, -96
+ .cfi_offset f21, -88
+ .cfi_offset f22, -80
+ .cfi_offset f23, -72
+ .cfi_offset f24, -64
+ .cfi_offset f25, -56
+ .cfi_offset f26, -48
+ .cfi_offset f27, -40
+ .cfi_offset f28, -32
+ .cfi_offset f29, -24
+ .cfi_offset f30, -16
+ .cfi_offset f31, -8
+ .cfi_offset v20, -368
+ .cfi_offset v21, -352
+ .cfi_offset v22, -336
+ .cfi_offset v23, -320
+ .cfi_offset v24, -304
+ .cfi_offset v25, -288
+ .cfi_offset v26, -272
+ .cfi_offset v27, -256
+ .cfi_offset v28, -240
+ .cfi_offset v29, -224
+ .cfi_offset v30, -208
+ .cfi_offset v31, -192
+ li 11, 48
+ li 0, 8
+ std 30, 304(1)
+ li 30, 12
+ li 12, 4
+ lfiwzx 0, 0, 5
+ stxvd2x 52, 1, 11
+ li 11, 64
+ lfiwzx 2, 5, 0
+ li 0, 20
+ lfiwzx 3, 5, 30
+ stxvd2x 53, 1, 11
+ li 11, 80
+ li 30, 24
+ lfiwzx 4, 5, 0
+ li 0, 28
+ stxvd2x 54, 1, 11
+ li 11, 96
+ lfiwzx 1, 5, 12
+ lfiwzx 6, 5, 30
+ xxspltw 47, 0, 1
+ cmpldi 4, 0
+ std 22, 240(1)
+ stxvd2x 55, 1, 11
+ li 11, 112
+ lfiwzx 7, 5, 0
+ xxspltw 40, 2, 1
+ std 23, 248(1)
+ xxspltw 39, 3, 1
+ std 24, 256(1)
+ std 25, 264(1)
+ xxspltw 51, 1, 1
+ xxspltw 43, 6, 1
+ std 26, 272(1)
+ xxspltw 41, 7, 1
+ std 27, 280(1)
+ std 28, 288(1)
+ std 29, 296(1)
+ stxvd2x 56, 1, 11
+ li 11, 128
+ stfd 20, 320(1)
+ stxvd2x 57, 1, 11
+ li 11, 144
+ stfd 21, 328(1)
+ stxvd2x 58, 1, 11
+ li 11, 160
+ stfd 22, 336(1)
+ stxvd2x 59, 1, 11
+ li 11, 176
+ stfd 23, 344(1)
+ stxvd2x 60, 1, 11
+ li 11, 192
+ stfd 24, 352(1)
+ stxvd2x 61, 1, 11
+ li 11, 208
+ stfd 25, 360(1)
+ stxvd2x 62, 1, 11
+ li 11, 224
+ stfd 26, 368(1)
+ stxvd2x 63, 1, 11
+ li 11, 16
+ xxspltw 63, 4, 1
+ lfiwzx 5, 5, 11
+ ld 5, 448(1)
+ stfd 27, 376(1)
+ stfd 28, 384(1)
+ stfd 29, 392(1)
+ stfd 30, 400(1)
+ stfd 31, 408(1)
+ xxspltw 50, 5, 1
+ beq 0, .LBB3_5
+ addis 30, 2, .LCPI3_0@toc@ha
+ neg 7, 7
+ xxleqv 34, 34, 34
+ addis 28, 2, .LCPI3_5@toc@ha
+ addis 27, 2, .LCPI3_6@toc@ha
+ addis 26, 2, .LCPI3_7@toc@ha
+ addis 29, 2, .LCPI3_4@toc@ha
+ addis 25, 2, .LCPI3_8@toc@ha
+ addi 0, 30, .LCPI3_0@toc@l
+ mtfprwz 2, 7
+ addis 7, 2, .LCPI3_1@toc@ha
+ addis 30, 2, .LCPI3_3@toc@ha
+ addi 24, 29, .LCPI3_4@toc@l
+ ld 29, 24(3)
+ lxvd2x 1, 0, 0
+ mtfprwz 0, 6
+ rldicl 6, 6, 32, 32
+ addi 0, 30, .LCPI3_3@toc@l
+ ld 30, 16(3)
+ xxspltw 2, 2, 1
+ vslw 2, 2, 2
+ xxspltw 37, 0, 1
+ mtfprwz 0, 6
+ addi 6, 7, .LCPI3_1@toc@l
+ addis 7, 2, .LCPI3_2@toc@ha
+ xxswapd 35, 1
+ xxlxor 36, 36, 36
+ xxspltw 33, 0, 1
+ xxland 35, 2, 35
+ vadduwm 0, 3, 5
+ lvx 5, 0, 6
+ addi 6, 7, .LCPI3_2@toc@l
+ ld 7, 8(3)
+ xxlor 35, 35, 34
+ xxlxor 34, 32, 34
+ xxlor 9, 32, 32
+ lvx 0, 0, 6
+ ld 6, 0(3)
+ addi 3, 3, -8
+ vcmpgtsw 2, 3, 2
+ lvx 3, 0, 0
+ addi 0, 28, .LCPI3_5@toc@l
+ addi 28, 27, .LCPI3_6@toc@l
+ addi 27, 26, .LCPI3_7@toc@l
+ addi 26, 25, .LCPI3_8@toc@l
+ or 25, 9, 8
+ li 9, 0
+ vcmpgtsb 5, 4, 5
+ vcmpgtsb 0, 4, 0
+ xxlor 11, 35, 35
+ lvx 3, 0, 24
+ xxlor 12, 35, 35
+ vsubuwm 2, 1, 2
+ xxlnor 10, 37, 37
+ xxlor 13, 34, 34
+ lvx 2, 0, 0
+ li 0, 32
+ xxlnor 31, 32, 32
+ xxlor 30, 34, 34
+ lvx 2, 0, 28
+ li 28, 48
+ xxlor 29, 34, 34
+ lvx 2, 0, 27
+ li 27, 0
+ xxlor 28, 34, 34
+ lvx 2, 0, 26
+ xxlor 27, 34, 34
+.LBB3_2:
+ mr 26, 27
+ addi 27, 27, 1
+ xxlor 23, 39, 39
+ cmpld 27, 4
+ sldi 26, 26, 6
+ xxlor 24, 40, 40
+ iseleq 24, 10, 9
+ add 23, 6, 26
+ add 22, 30, 26
+ lxvd2x 0, 6, 26
+ lxvd2x 1, 7, 26
+ or 25, 24, 25
+ add 24, 7, 26
+ lxvd2x 2, 30, 26
+ lxvd2x 3, 29, 26
+ xxlor 26, 47, 47
+ lxvd2x 4, 23, 11
+ lxvd2x 6, 24, 11
+ clrlwi 25, 25, 24
+ xxlor 25, 51, 51
+ lxvd2x 7, 22, 11
+ lxvd2x 8, 23, 0
+ mtfprd 5, 25
+ add 25, 29, 26
+ xxswapd 34, 0
+ lxvd2x 0, 25, 11
+ xxswapd 38, 1
+ xxswapd 32, 2
+ lxvd2x 1, 24, 0
+ lxvd2x 2, 22, 0
+ xxswapd 40, 3
+ xxswapd 39, 4
+ lxvd2x 3, 25, 0
+ lxvd2x 4, 23, 28
+ xxswapd 60, 6
+ xxswapd 47, 7
+ lxvd2x 6, 24, 28
+ xxswapd 57, 8
+ lxvd2x 7, 22, 28
+ lxvd2x 8, 25, 28
+ xxswapd 58, 0
+ mr 25, 3
+ xxswapd 53, 1
+ xxswapd 56, 2
+ xxswapd 52, 3
+ xxswapd 55, 4
+ xxswapd 54, 6
+ xxswapd 0, 5
+ xxswapd 42, 7
+ xxswapd 48, 8
+ mtctr 12
+.LBB3_3:
+ ldu 24, 8(25)
+ add 24, 24, 26
+ addi 24, 24, 256
+ dcbt 0, 24
+ bdnz .LBB3_3
+ vmrgew 4, 28, 7
+ vspltisw 14, 9
+ mr 25, 8
+ vmrgew 27, 6, 2
+ vspltisw 17, 4
+ vmrglw 12, 6, 2
+ vspltisw 19, 10
+ vmrghw 30, 6, 2
+ xxspltw 0, 0, 3
+ vmrglw 2, 8, 0
+ vmrghw 13, 8, 0
+ xxlor 7, 36, 36
+ vmrgew 4, 21, 25
+ vmrglw 29, 28, 7
+ vmrghw 1, 28, 7
+ vmrglw 28, 26, 15
+ xxmrgld 37, 34, 44
+ vmrgew 7, 26, 15
+ vmrghw 15, 26, 15
+ xxlor 21, 36, 36
+ vmrglw 4, 21, 25
+ vmrghw 21, 21, 25
+ vmrglw 25, 20, 24
+ xxmrgld 34, 60, 61
+ vmrghw 26, 20, 24
+ xxlor 38, 26, 26
+ vmrgew 3, 8, 0
+ xxlor 5, 36, 36
+ vmrgew 4, 20, 24
+ vspltisw 24, -16
+ vmrglw 20, 22, 23
+ xxmrgld 57, 57, 5
+ vmrglw 8, 16, 10
+ vmrghw 0, 16, 10
+ vadduwm 12, 19, 19
+ xxlor 8, 37, 37
+ xxlor 20, 36, 36
+ vmrgew 4, 22, 23
+ vmrghw 23, 22, 23
+ xxmrgld 40, 40, 52
+ vmrgew 22, 16, 10
+ vsubuwm 10, 14, 24
+ vslw 14, 17, 17
+ vadduwm 17, 5, 6
+ xxmrgld 37, 47, 33
+ xxlor 22, 36, 36
+ xxmrgld 36, 45, 62
+ xxlor 38, 25, 25
+ xxlor 2, 34, 34
+ vadduwm 19, 4, 6
+ xxmrgld 38, 39, 7
+ xxlor 3, 36, 36
+ xxmrghd 39, 47, 33
+ xxlor 36, 24, 24
+ xxmrgld 33, 58, 53
+ vadduwm 17, 17, 18
+ vadduwm 29, 2, 4
+ xxmrgld 36, 35, 59
+ xxlor 34, 23, 23
+ xxmrghd 35, 45, 62
+ xxlor 1, 9, 9
+ vadduwm 28, 5, 2
+ xxlor 1, 13, 13
+ vadduwm 19, 19, 31
+ vadduwm 24, 29, 11
+ vadduwm 28, 28, 9
+ xxlxor 61, 49, 9
+ xxlor 1, 41, 41
+ xxlor 41, 11, 11
+ xxlxor 34, 51, 13
+ vperm 29, 29, 29, 9
+ xxlxor 46, 56, 46
+ vperm 2, 2, 2, 9
+ xxlxor 59, 60, 0
+ vperm 14, 14, 14, 9
+ vperm 30, 27, 27, 9
+ vadduwm 19, 19, 3
+ xxlor 4, 35, 35
+ xxland 61, 61, 10
+ xxlor 35, 12, 12
+ xxland 34, 34, 10
+ vadduwm 27, 29, 3
+ xxlor 35, 30, 30
+ vadduwm 17, 17, 4
+ xxlor 26, 36, 36
+ xxland 46, 46, 10
+ vadduwm 3, 2, 3
+ xxlor 36, 29, 29
+ xxland 62, 62, 10
+ xxlxor 45, 59, 50
+ xxlxor 50, 35, 63
+ vadduwm 31, 14, 4
+ xxlor 36, 28, 28
+ xxlor 6, 37, 37
+ vadduwm 16, 30, 4
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 1
+ vrlw 4, 13, 12
+ vrlw 18, 18, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 15, 24, 6
+ vadduwm 28, 28, 7
+ vadduwm 17, 4, 17
+ vadduwm 19, 18, 19
+ vadduwm 15, 11, 15
+ vadduwm 28, 5, 28
+ xxlor 25, 38, 38
+ xxlxor 61, 49, 61
+ xxlxor 34, 51, 34
+ xxlxor 46, 47, 46
+ xxlxor 62, 60, 62
+ xxlor 38, 27, 27
+ vadduwm 19, 19, 1
+ vperm 29, 29, 29, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 14, 14, 6
+ vperm 30, 30, 30, 6
+ xxlor 5, 33, 33
+ vadduwm 17, 17, 25
+ xxland 61, 61, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ xxland 62, 62, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 2, 3
+ vadduwm 31, 24, 31
+ vadduwm 16, 30, 16
+ xxlxor 36, 59, 36
+ xxlxor 50, 35, 50
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 18, 10
+ xxmrgld 50, 32, 55
+ vrlw 11, 11, 10
+ xxmrghd 55, 32, 55
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 15, 15, 8
+ vadduwm 28, 28, 18
+ vadduwm 17, 1, 17
+ vadduwm 19, 11, 19
+ vadduwm 15, 5, 15
+ vadduwm 28, 4, 28
+ xxlor 7, 57, 57
+ xxlxor 62, 49, 62
+ xxlxor 61, 51, 61
+ xxlxor 57, 47, 34
+ xxlxor 34, 60, 56
+ vperm 24, 30, 30, 9
+ xxmrgld 62, 20, 21
+ vperm 29, 29, 29, 9
+ vperm 25, 25, 25, 9
+ vperm 2, 2, 2, 9
+ vmr 14, 8
+ xxmrghd 40, 58, 53
+ xxmrgld 58, 54, 22
+ vadduwm 17, 17, 30
+ xxland 56, 56, 10
+ vadduwm 21, 19, 8
+ xxland 61, 61, 10
+ xxland 51, 57, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vadduwm 0, 15, 26
+ vadduwm 15, 28, 23
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 21
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vmr 13, 8
+ xxlor 53, 3, 3
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 52, 4, 4
+ xxlor 40, 2, 2
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 8
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ vmr 25, 26
+ xxlor 3, 39, 39
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 54, 6, 6
+ xxlor 58, 5, 5
+ xxlor 39, 8, 8
+ vadduwm 17, 17, 22
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 39, 26, 26
+ vadduwm 28, 28, 14
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 7
+ vadduwm 0, 0, 30
+ vadduwm 15, 15, 23
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 24, 55, 55
+ vadduwm 17, 17, 13
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vmr 23, 13
+ xxlor 45, 25, 25
+ xxlor 39, 7, 7
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 7
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 2, 46, 46
+ xxlor 46, 3, 3
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vadduwm 17, 17, 20
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 25
+ vadduwm 15, 15, 14
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 52, 2, 2
+ vadduwm 17, 17, 8
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 21
+ vadduwm 15, 15, 18
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ vadduwm 17, 17, 22
+ vadduwm 28, 28, 30
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 0, 0, 23
+ vadduwm 15, 15, 7
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 5, 4, 4
+ xxlor 4, 58, 58
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 39, 8, 8
+ xxlor 54, 24, 24
+ xxlor 58, 26, 26
+ vadduwm 17, 17, 13
+ vadduwm 28, 28, 7
+ vadduwm 0, 0, 22
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 3, 53, 53
+ xxlor 53, 4, 4
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 20
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 2, 55, 55
+ vmr 23, 18
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 50, 5, 5
+ vadduwm 17, 17, 14
+ vadduwm 28, 28, 30
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 22
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 25, 40, 40
+ vmr 8, 13
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ xxlor 45, 25, 25
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 13
+ xxlor 45, 2, 2
+ vadduwm 0, 0, 8
+ vadduwm 28, 28, 13
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 4, 57, 57
+ xxlor 26, 46, 46
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 8, 62, 62
+ xxlor 57, 3, 3
+ xxlor 46, 7, 7
+ xxlor 62, 6, 6
+ vadduwm 17, 17, 7
+ vadduwm 28, 28, 25
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 30
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vadduwm 17, 17, 20
+ xxlor 3, 52, 52
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 52, 8, 8
+ vadduwm 0, 0, 22
+ vadduwm 28, 28, 20
+ vadduwm 15, 15, 23
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 6, 55, 55
+ xxlor 55, 4, 4
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 17, 17, 23
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 21
+ vadduwm 15, 15, 14
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 4, 53, 53
+ xxlor 53, 26, 26
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 17, 17, 21
+ vadduwm 28, 28, 8
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 30
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 5, 25, 25
+ xxlor 2, 58, 58
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vmr 22, 26
+ vadduwm 0, 0, 26
+ xxlor 58, 5, 5
+ vadduwm 17, 17, 25
+ vadduwm 28, 28, 18
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 7, 24, 24
+ xxlor 8, 57, 57
+ xxland 56, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 59, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 57, 7, 7
+ vadduwm 17, 17, 20
+ vadduwm 28, 28, 13
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 25
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 24, 24, 24, 9
+ xxlor 5, 52, 52
+ xxlor 23, 45, 45
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 56, 56, 10
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ xxlor 52, 6, 6
+ vadduwm 28, 28, 8
+ vmr 13, 8
+ xxlor 40, 3, 3
+ vadduwm 17, 17, 20
+ vadduwm 0, 0, 8
+ vadduwm 15, 15, 22
+ vadduwm 17, 4, 17
+ vadduwm 28, 1, 28
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 49, 61
+ xxlxor 51, 60, 51
+ xxlxor 34, 32, 34
+ xxlxor 56, 47, 56
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 24, 24, 24, 6
+ xxlor 25, 39, 39
+ vmr 7, 30
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 56, 56, 31
+ vadduwm 27, 29, 27
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 24, 16
+ xxlxor 36, 59, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vmr 30, 18
+ xxlor 24, 46, 46
+ xxlor 46, 25, 25
+ xxlor 50, 8, 8
+ vadduwm 17, 17, 23
+ vadduwm 28, 28, 14
+ vadduwm 0, 0, 18
+ vadduwm 15, 15, 26
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 9
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ xxlor 6, 58, 58
+ xxlor 58, 4, 4
+ xxland 56, 56, 10
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ vadduwm 31, 24, 31
+ vadduwm 16, 29, 16
+ vadduwm 27, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 59, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ vadduwm 17, 17, 30
+ vadduwm 28, 28, 26
+ vadduwm 0, 0, 7
+ vadduwm 15, 15, 21
+ vadduwm 17, 1, 17
+ vadduwm 28, 11, 28
+ vadduwm 0, 5, 0
+ vadduwm 15, 4, 15
+ xxlxor 56, 49, 56
+ xxlxor 61, 60, 61
+ xxlxor 51, 32, 51
+ xxlxor 34, 47, 34
+ vperm 24, 24, 24, 6
+ vperm 29, 29, 29, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ xxlor 40, 23, 23
+ vadduwm 13, 28, 13
+ vadduwm 8, 17, 8
+ xxland 49, 56, 31
+ xxland 61, 61, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ vadduwm 31, 17, 31
+ vadduwm 16, 29, 16
+ vadduwm 28, 19, 27
+ vadduwm 3, 2, 3
+ xxlxor 33, 63, 33
+ xxlxor 43, 48, 43
+ xxlxor 36, 35, 36
+ xxlxor 37, 60, 37
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlor 2, 55, 55
+ vmr 23, 30
+ xxlor 62, 24, 24
+ vadduwm 0, 0, 22
+ vadduwm 15, 15, 30
+ vadduwm 8, 4, 8
+ vadduwm 13, 1, 13
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 61, 40, 61
+ xxlxor 51, 45, 51
+ xxlxor 34, 32, 34
+ xxlxor 49, 47, 49
+ vperm 29, 29, 29, 9
+ vperm 19, 19, 19, 9
+ vperm 2, 2, 2, 9
+ vperm 17, 17, 17, 9
+ vadduwm 13, 13, 14
+ xxlor 46, 5, 5
+ xxland 61, 61, 10
+ xxland 51, 51, 10
+ xxland 34, 34, 10
+ xxland 49, 49, 10
+ vadduwm 28, 29, 28
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 17, 16
+ xxlxor 36, 60, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 4, 4, 12
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vadduwm 8, 8, 25
+ vadduwm 0, 0, 14
+ vadduwm 15, 15, 7
+ vadduwm 8, 4, 8
+ vadduwm 13, 1, 13
+ vadduwm 0, 11, 0
+ vadduwm 15, 5, 15
+ xxlxor 62, 40, 61
+ xxlxor 51, 45, 51
+ xxlxor 34, 32, 34
+ xxlxor 49, 47, 49
+ vperm 30, 30, 30, 6
+ vperm 19, 19, 19, 6
+ vperm 2, 2, 2, 6
+ vperm 17, 17, 17, 6
+ vadduwm 29, 8, 20
+ vadduwm 8, 13, 18
+ xxland 45, 62, 31
+ xxland 51, 51, 31
+ xxland 34, 34, 31
+ xxland 49, 49, 31
+ vadduwm 30, 13, 28
+ vadduwm 3, 19, 3
+ vadduwm 31, 2, 31
+ vadduwm 16, 17, 16
+ xxlxor 36, 62, 36
+ xxlxor 33, 35, 33
+ xxlxor 43, 63, 43
+ xxlxor 37, 48, 37
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ vrlw 4, 4, 10
+ vadduwm 0, 0, 23
+ vadduwm 7, 15, 21
+ vadduwm 29, 1, 29
+ vadduwm 8, 11, 8
+ vadduwm 0, 5, 0
+ vadduwm 7, 4, 7
+ xxlxor 47, 61, 49
+ xxlxor 45, 40, 45
+ xxlxor 49, 32, 51
+ xxlxor 34, 39, 34
+ vperm 15, 15, 15, 9
+ vperm 13, 13, 13, 9
+ vperm 17, 17, 17, 9
+ vperm 2, 2, 2, 9
+ xxlor 46, 3, 3
+ vadduwm 9, 29, 26
+ vadduwm 8, 8, 14
+ xxland 46, 47, 10
+ xxland 45, 45, 10
+ xxland 47, 49, 10
+ xxland 34, 34, 10
+ vadduwm 17, 14, 31
+ vadduwm 16, 13, 16
+ vadduwm 18, 15, 30
+ vadduwm 3, 2, 3
+ xxlxor 33, 49, 33
+ xxlxor 43, 48, 43
+ xxlxor 37, 50, 37
+ xxlxor 36, 35, 36
+ vrlw 1, 1, 12
+ vrlw 11, 11, 12
+ vrlw 5, 5, 12
+ vrlw 4, 4, 12
+ xxlor 44, 6, 6
+ xxlor 0, 10, 10
+ vadduwm 0, 0, 12
+ xxlor 44, 2, 2
+ vadduwm 9, 1, 9
+ vadduwm 7, 7, 12
+ vadduwm 8, 11, 8
+ vadduwm 7, 4, 7
+ vadduwm 0, 5, 0
+ xxlxor 34, 39, 34
+ xxlxor 44, 32, 47
+ vperm 2, 2, 2, 6
+ xxlxor 46, 41, 46
+ xxlxor 45, 40, 45
+ vperm 12, 12, 12, 6
+ vperm 14, 14, 14, 6
+ vperm 13, 13, 13, 6
+ xxland 34, 34, 31
+ xxlor 1, 31, 31
+ vadduwm 3, 2, 3
+ xxland 44, 44, 31
+ xxlxor 36, 35, 36
+ xxlxor 51, 35, 40
+ xxland 35, 46, 31
+ xxland 38, 45, 31
+ vadduwm 15, 12, 18
+ vadduwm 8, 3, 17
+ vadduwm 13, 6, 16
+ xxlxor 37, 47, 37
+ xxlxor 33, 40, 33
+ xxlxor 43, 45, 43
+ vrlw 4, 4, 10
+ vrlw 1, 1, 10
+ vrlw 11, 11, 10
+ vrlw 5, 5, 10
+ xxlxor 47, 47, 41
+ xxlxor 40, 40, 32
+ xxlxor 39, 45, 39
+ xxlxor 50, 36, 38
+ xxlxor 63, 33, 44
+ xxlxor 43, 43, 34
+ xxlxor 41, 37, 35
+ bne 0, .LBB3_2
+.LBB3_5:
+ vmrglw 2, 19, 15
+ li 3, 32
+ li 4, 48
+ vmrglw 4, 7, 8
+ vmrglw 0, 31, 18
+ vmrglw 1, 9, 11
+ vmrghw 3, 19, 15
+ vmrghw 5, 7, 8
+ vmrghw 6, 31, 18
+ vmrghw 7, 9, 11
+ xxmrgld 40, 36, 34
+ xxmrghd 34, 36, 34
+ xxmrgld 41, 33, 32
+ xxswapd 0, 40
+ xxmrgld 36, 37, 35
+ xxmrghd 35, 37, 35
+ xxmrghd 37, 33, 32
+ xxswapd 1, 41
+ xxmrgld 32, 39, 38
+ xxmrghd 33, 39, 38
+ xxswapd 2, 34
+ xxswapd 4, 36
+ xxswapd 3, 37
+ stxvd2x 0, 0, 5
+ xxswapd 5, 32
+ stxvd2x 1, 5, 11
+ xxswapd 0, 35
+ xxswapd 1, 33
+ stxvd2x 2, 5, 3
+ li 3, 64
+ stxvd2x 3, 5, 4
+ li 4, 80
+ stxvd2x 4, 5, 3
+ li 3, 96
+ stxvd2x 5, 5, 4
+ li 4, 112
+ stxvd2x 0, 5, 3
+ stxvd2x 1, 5, 4
+ li 3, 224
+ lxvd2x 63, 1, 3
+ li 3, 208
+ lfd 31, 408(1)
+ ld 30, 304(1)
+ ld 29, 296(1)
+ lxvd2x 62, 1, 3
+ li 3, 192
+ lfd 30, 400(1)
+ ld 28, 288(1)
+ ld 27, 280(1)
+ lxvd2x 61, 1, 3
+ li 3, 176
+ lfd 29, 392(1)
+ ld 26, 272(1)
+ ld 25, 264(1)
+ lxvd2x 60, 1, 3
+ li 3, 160
+ lfd 28, 384(1)
+ ld 24, 256(1)
+ ld 23, 248(1)
+ lxvd2x 59, 1, 3
+ li 3, 144
+ lfd 27, 376(1)
+ ld 22, 240(1)
+ lxvd2x 58, 1, 3
+ li 3, 128
+ lfd 26, 368(1)
+ lxvd2x 57, 1, 3
+ li 3, 112
+ lfd 25, 360(1)
+ lxvd2x 56, 1, 3
+ li 3, 96
+ lfd 24, 352(1)
+ lxvd2x 55, 1, 3
+ li 3, 80
+ lfd 23, 344(1)
+ lxvd2x 54, 1, 3
+ li 3, 64
+ lfd 22, 336(1)
+ lxvd2x 53, 1, 3
+ li 3, 48
+ lfd 21, 328(1)
+ lxvd2x 52, 1, 3
+ lfd 20, 320(1)
+ addi 1, 1, 416
+ blr
+ .long 0
+ .quad 0
+.Lfunc_end3:
+ .size blake3_hash4_sse41, .Lfunc_end3-.Lfunc_begin3
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S
new file mode 100644
index 000000000..b15d8fc77
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S
@@ -0,0 +1,1845 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
+ */
+
+#if defined(HAVE_AVX2)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_hash_many_avx2
+.text
+
+.type zfs_blake3_hash_many_avx2,@function
+.p2align 6
+zfs_blake3_hash_many_avx2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 680
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ vmovd xmm0, r9d
+ vpbroadcastd ymm0, xmm0
+ vmovdqa ymmword ptr [rsp+0x280], ymm0
+ vpand ymm1, ymm0, ymmword ptr [ADD0+rip]
+ vpand ymm2, ymm0, ymmword ptr [ADD1+rip]
+ vmovdqa ymmword ptr [rsp+0x220], ymm2
+ vmovd xmm2, r8d
+ vpbroadcastd ymm2, xmm2
+ vpaddd ymm2, ymm2, ymm1
+ vmovdqa ymmword ptr [rsp+0x240], ymm2
+ vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm1, ymm2
+ shr r8, 32
+ vmovd xmm3, r8d
+ vpbroadcastd ymm3, xmm3
+ vpsubd ymm3, ymm3, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm3
+ shl rdx, 6
+ mov qword ptr [rsp+0x2A0], rdx
+ cmp rsi, 8
+ jc 3f
+2:
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x2A0]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x200], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x20], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x40], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x60], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x80], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0xA0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0xC0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0xE0], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x100], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x120], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x140], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x160], ymm11
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm8, ymm12, ymm14, 136
+ vmovaps ymmword ptr [rsp+0x180], ymm8
+ vshufps ymm9, ymm12, ymm14, 221
+ vmovaps ymmword ptr [rsp+0x1A0], ymm9
+ vshufps ymm10, ymm13, ymm15, 136
+ vmovaps ymmword ptr [rsp+0x1C0], ymm10
+ vshufps ymm11, ymm13, ymm15, 221
+ vmovaps ymmword ptr [rsp+0x1E0], ymm11
+ vpbroadcastd ymm15, dword ptr [rsp+0x200]
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm0, ymmword ptr [rsp+0x240]
+ vpxor ymm13, ymm1, ymmword ptr [rsp+0x260]
+ vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpxor ymm15, ymm3, ymm15
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip]
+ vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip]
+ vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip]
+ vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip]
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0]
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxor ymm12, ymm12, ymm0
+ vpxor ymm13, ymm13, ymm1
+ vpxor ymm14, ymm14, ymm2
+ vpxor ymm15, ymm15, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpshufb ymm15, ymm15, ymm8
+ vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxor ymm4, ymm4, ymm8
+ vpxor ymm5, ymm5, ymm9
+ vpxor ymm6, ymm6, ymm10
+ vpxor ymm7, ymm7, ymm11
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT16+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vmovdqa ymmword ptr [rsp+0x200], ymm8
+ vpsrld ymm8, ymm5, 12
+ vpslld ymm5, ymm5, 20
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 12
+ vpslld ymm6, ymm6, 20
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 12
+ vpslld ymm7, ymm7, 20
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 12
+ vpslld ymm4, ymm4, 20
+ vpor ymm4, ymm4, ymm8
+ vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140]
+ vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180]
+ vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80]
+ vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0]
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxor ymm15, ymm15, ymm0
+ vpxor ymm12, ymm12, ymm1
+ vpxor ymm13, ymm13, ymm2
+ vpxor ymm14, ymm14, ymm3
+ vbroadcasti128 ymm8, xmmword ptr [ROT8+rip]
+ vpshufb ymm15, ymm15, ymm8
+ vpshufb ymm12, ymm12, ymm8
+ vpshufb ymm13, ymm13, ymm8
+ vpshufb ymm14, ymm14, ymm8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200]
+ vpaddd ymm9, ymm9, ymm14
+ vpxor ymm5, ymm5, ymm10
+ vpxor ymm6, ymm6, ymm11
+ vpxor ymm7, ymm7, ymm8
+ vpxor ymm4, ymm4, ymm9
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpsrld ymm8, ymm5, 7
+ vpslld ymm5, ymm5, 25
+ vpor ymm5, ymm5, ymm8
+ vpsrld ymm8, ymm6, 7
+ vpslld ymm6, ymm6, 25
+ vpor ymm6, ymm6, ymm8
+ vpsrld ymm8, ymm7, 7
+ vpslld ymm7, ymm7, 25
+ vpor ymm7, ymm7, ymm8
+ vpsrld ymm8, ymm4, 7
+ vpslld ymm4, ymm4, 25
+ vpor ymm4, ymm4, ymm8
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp+0x220]
+ vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240]
+ vmovdqa ymmword ptr [rsp+0x240], ymm1
+ vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip]
+ vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip]
+ vpcmpgtd ymm2, ymm0, ymm2
+ vmovdqa ymm0, ymmword ptr [rsp+0x260]
+ vpsubd ymm2, ymm0, ymm2
+ vmovdqa ymmword ptr [rsp+0x260], ymm2
+ add rdi, 64
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 8
+ cmp rsi, 8
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x2A0]
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ test rsi, 0x4
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovdqa ymm8, ymm0
+ vmovdqa ymm9, ymm1
+ vbroadcasti128 ymm12, xmmword ptr [rsp+0x240]
+ vbroadcasti128 ymm13, xmmword ptr [rsp+0x260]
+ vpunpckldq ymm14, ymm12, ymm13
+ vpunpckhdq ymm15, ymm12, ymm13
+ vpermq ymm14, ymm14, 0x50
+ vpermq ymm15, ymm15, 0x50
+ vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpblendd ymm14, ymm14, ymm12, 0x44
+ vpblendd ymm15, ymm15, ymm12, 0x44
+ vmovdqa ymmword ptr [rsp], ymm14
+ vmovdqa ymmword ptr [rsp+0x20], ymm15
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vmovups ymm2, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm2, ymm3, 136
+ vshufps ymm5, ymm2, ymm3, 221
+ vmovups ymm2, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm3, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm2, ymm3, 136
+ vshufps ymm7, ymm2, ymm3, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ vmovups ymm10, ymmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01
+ vshufps ymm12, ymm10, ymm11, 136
+ vshufps ymm13, ymm10, ymm11, 221
+ vmovups ymm10, ymmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01
+ vmovups ymm11, ymmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01
+ vshufps ymm14, ymm10, ymm11, 136
+ vshufps ymm15, ymm10, ymm11, 221
+ vpshufd ymm14, ymm14, 0x93
+ vpshufd ymm15, ymm15, 0x93
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ vpbroadcastd ymm2, dword ptr [rsp+0x200]
+ vmovdqa ymm3, ymmword ptr [rsp]
+ vmovdqa ymm11, ymmword ptr [rsp+0x20]
+ vpblendd ymm3, ymm3, ymm2, 0x88
+ vpblendd ymm11, ymm11, ymm2, 0x88
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa ymm10, ymm2
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm8, ymm8, ymm12
+ vmovdqa ymmword ptr [rsp+0x40], ymm4
+ nop
+ vmovdqa ymmword ptr [rsp+0x60], ymm12
+ nop
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vmovdqa ymmword ptr [rsp+0x80], ymm5
+ vmovdqa ymmword ptr [rsp+0xA0], ymm13
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm8, ymm8, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpshufd ymm10, ymm10, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm8, ymm8, ymm14
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT16+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 12
+ vpslld ymm9, ymm9, 20
+ vpor ymm9, ymm9, ymm4
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm8, ymm8, ymm15
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm8, ymm8, ymm9
+ vpxor ymm3, ymm3, ymm0
+ vpxor ymm11, ymm11, ymm8
+ vbroadcasti128 ymm4, xmmword ptr [ROT8+rip]
+ vpshufb ymm3, ymm3, ymm4
+ vpshufb ymm11, ymm11, ymm4
+ vpaddd ymm2, ymm2, ymm3
+ vpaddd ymm10, ymm10, ymm11
+ vpxor ymm1, ymm1, ymm2
+ vpxor ymm9, ymm9, ymm10
+ vpsrld ymm4, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm4
+ vpsrld ymm4, ymm9, 7
+ vpslld ymm9, ymm9, 25
+ vpor ymm9, ymm9, ymm4
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm8, ymm8, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm11, ymm11, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ vpshufd ymm10, ymm10, 0x93
+ dec al
+ je 9f
+ vmovdqa ymm4, ymmword ptr [rsp+0x40]
+ vmovdqa ymm5, ymmword ptr [rsp+0x80]
+ vshufps ymm12, ymm4, ymm5, 214
+ vpshufd ymm13, ymm4, 0x0F
+ vpshufd ymm4, ymm12, 0x39
+ vshufps ymm12, ymm6, ymm7, 250
+ vpblendd ymm13, ymm13, ymm12, 0xAA
+ vpunpcklqdq ymm12, ymm7, ymm5
+ vpblendd ymm12, ymm12, ymm6, 0x88
+ vpshufd ymm12, ymm12, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymmword ptr [rsp+0x40], ymm13
+ vmovdqa ymmword ptr [rsp+0x80], ymm12
+ vmovdqa ymm12, ymmword ptr [rsp+0x60]
+ vmovdqa ymm13, ymmword ptr [rsp+0xA0]
+ vshufps ymm5, ymm12, ymm13, 214
+ vpshufd ymm6, ymm12, 0x0F
+ vpshufd ymm12, ymm5, 0x39
+ vshufps ymm5, ymm14, ymm15, 250
+ vpblendd ymm6, ymm6, ymm5, 0xAA
+ vpunpcklqdq ymm5, ymm15, ymm13
+ vpblendd ymm5, ymm5, ymm14, 0x88
+ vpshufd ymm5, ymm5, 0x78
+ vpunpckhdq ymm13, ymm13, ymm15
+ vpunpckldq ymm14, ymm14, ymm13
+ vpshufd ymm15, ymm14, 0x1E
+ vmovdqa ymm13, ymm6
+ vmovdqa ymm14, ymm5
+ vmovdqa ymm5, ymmword ptr [rsp+0x40]
+ vmovdqa ymm6, ymmword ptr [rsp+0x80]
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ vpxor ymm8, ymm8, ymm10
+ vpxor ymm9, ymm9, ymm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqu xmmword ptr [rbx+0x40], xmm8
+ vmovdqu xmmword ptr [rbx+0x50], xmm9
+ vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01
+ vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01
+ vmovaps xmm8, xmmword ptr [rsp+0x280]
+ vmovaps xmm0, xmmword ptr [rsp+0x240]
+ vmovaps xmm1, xmmword ptr [rsp+0x250]
+ vmovaps xmm2, xmmword ptr [rsp+0x260]
+ vmovaps xmm3, xmmword ptr [rsp+0x270]
+ vblendvps xmm0, xmm0, xmm1, xmm8
+ vblendvps xmm2, xmm2, xmm3, xmm8
+ vmovaps xmmword ptr [rsp+0x240], xmm0
+ vmovaps xmmword ptr [rsp+0x260], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test rsi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp+0x240]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x244]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ vbroadcasti128 ymm14, xmmword ptr [ROT16+rip]
+ vbroadcasti128 ymm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x200], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x200]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm14
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 12
+ vpslld ymm1, ymm1, 20
+ vpor ymm1, ymm1, ymm8
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxor ymm3, ymm3, ymm0
+ vpshufb ymm3, ymm3, ymm15
+ vpaddd ymm2, ymm2, ymm3
+ vpxor ymm1, ymm1, ymm2
+ vpsrld ymm8, ymm1, 7
+ vpslld ymm1, ymm1, 25
+ vpor ymm1, ymm1, ymm8
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovaps ymm8, ymmword ptr [rsp+0x280]
+ vmovaps ymm0, ymmword ptr [rsp+0x240]
+ vmovups ymm1, ymmword ptr [rsp+0x248]
+ vmovaps ymm2, ymmword ptr [rsp+0x260]
+ vmovups ymm3, ymmword ptr [rsp+0x268]
+ vblendvps ymm0, ymm0, ymm1, ymm8
+ vblendvps ymm2, ymm2, ymm3, ymm8
+ vmovaps ymmword ptr [rsp+0x240], ymm0
+ vmovaps ymmword ptr [rsp+0x260], ymm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test rsi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm3, dword ptr [rsp+0x240]
+ vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1
+ vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm14, xmmword ptr [ROT16+rip]
+ vmovdqa xmm15, xmmword ptr [ROT8+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip]
+ vmovdqa xmm3, xmm13
+ vpinsrd xmm3, xmm3, eax, 3
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm14
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 12
+ vpslld xmm1, xmm1, 20
+ vpor xmm1, xmm1, xmm8
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxor xmm3, xmm3, xmm0
+ vpshufb xmm3, xmm3, xmm15
+ vpaddd xmm2, xmm2, xmm3
+ vpxor xmm1, xmm1, xmm2
+ vpsrld xmm8, xmm1, 7
+ vpslld xmm1, xmm1, 25
+ vpor xmm1, xmm1, xmm8
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.size zfs_blake3_hash_many_avx2, . - zfs_blake3_hash_many_avx2
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+
+.p2align 6 /* constant tables; ROT16, ROT8 and BLAKE3_IV are read with aligned vmovdqa loads, so the sizes of the tables ahead of them must keep those labels 16-byte aligned */
+ADD0: /* lane numbers 0..7, one dword per ymm lane; NOTE(review): presumably the per-input counter offsets - confirm against the function prologue */
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ADD1: /* the value 8 in every lane; NOTE(review): presumably the counter step for the loop that consumes 8 inputs per pass - confirm against the function prologue */
+ .long 8, 8, 8, 8, 8, 8, 8, 8
+BLAKE3_IV_0: /* first word of BLAKE3_IV (below) repeated in all eight lanes */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1: /* second word of BLAKE3_IV repeated in all eight lanes */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2: /* third word of BLAKE3_IV repeated in all eight lanes */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3: /* fourth word of BLAKE3_IV repeated in all eight lanes */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN: /* 64 (0x40) in every lane, matching the 64-byte step taken per block; the 4-, 2- and 1-input paths place it next to the counter words */
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ .long 0x00000040, 0x00000040, 0x00000040, 0x00000040
+ROT16: /* vpshufb control (16 bytes): swaps the 16-bit halves of each dword, i.e. rotates each 32-bit lane by 16 */
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8: /* vpshufb control (16 bytes): rotates each 32-bit lane right by 8 bits */
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+CMP_MSB_MASK: /* sign-bit mask: XORed into both operands so the signed vpcmpgtd orders them as unsigned values when detecting counter carry */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+BLAKE3_IV: /* the four 32-bit IV words, one 16-byte copy; loaded with 128-bit loads and vbroadcasti128 */
+ .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+#endif /* HAVE_AVX2 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_avx512.S b/module/icp/asm-x86_64/blake3/blake3_avx512.S
new file mode 100644
index 000000000..d02c5e7ec
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_avx512.S
@@ -0,0 +1,2618 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#if defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) /* assemble the code that follows only when both feature macros are defined */
+
+#define _ASM /* defined before the include below; presumably tells asm_linkage.h it is included from assembly - TODO confirm */
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* cet.h is considered only for ELF builds with __CET__ set and __has_include available */
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR) /* fallback: if nothing above defined _CET_ENDBR, make it expand to nothing at the entry point */
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix /* Intel operand order (destination first), register names written without a prefix character */
+.global zfs_blake3_hash_many_avx512 /* export the three entry-point symbols declared here */
+.global zfs_blake3_compress_in_place_avx512
+.global zfs_blake3_compress_xof_avx512
+.text
+
+.type zfs_blake3_hash_many_avx512,@function /* tag each exported symbol as a function in the symbol table */
+.type zfs_blake3_compress_xof_avx512,@function
+.type zfs_blake3_compress_in_place_avx512,@function
+
+.p2align 6
+zfs_blake3_hash_many_avx512:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 144
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9
+ kmovw k1, r9d
+ vmovd xmm0, r8d
+ vpbroadcastd ymm0, xmm0
+ shr r8, 32
+ vmovd xmm1, r8d
+ vpbroadcastd ymm1, xmm1
+ vmovdqa ymm4, ymm1
+ vmovdqa ymm5, ymm1
+ vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip]
+ vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip]
+ vpcmpltud k2, ymm2, ymm0
+ vpcmpltud k3, ymm3, ymm0
+ vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8}
+ vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8}
+ knotw k2, k1
+ vmovdqa32 ymm2 {k2}, ymm0
+ vmovdqa32 ymm3 {k2}, ymm0
+ vmovdqa32 ymm4 {k2}, ymm1
+ vmovdqa32 ymm5 {k2}, ymm1
+ vmovdqa ymmword ptr [rsp], ymm2
+ vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4
+ vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5
+ shl rdx, 6
+ mov qword ptr [rsp+0x80], rdx
+ cmp rsi, 16
+ jc 3f
+2:
+ vpbroadcastd zmm0, dword ptr [rcx]
+ vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4]
+ vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4]
+ vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4]
+ vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4]
+ vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4]
+ vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4]
+ vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+.p2align 5
+9:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm8, zmm16, zmm17
+ vpunpckhqdq zmm9, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm10, zmm18, zmm19
+ vpunpckhqdq zmm11, zmm18, zmm19
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
+ vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
+ vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
+ vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
+ vpunpcklqdq zmm12, zmm16, zmm17
+ vpunpckhqdq zmm13, zmm16, zmm17
+ vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
+ vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
+ vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
+ vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
+ vpunpcklqdq zmm14, zmm18, zmm19
+ vpunpckhqdq zmm15, zmm18, zmm19
+ vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
+ vmovdqa32 zmm31, zmmword ptr [INDEX1+rip]
+ vshufps zmm16, zmm8, zmm10, 136
+ vshufps zmm17, zmm12, zmm14, 136
+ vmovdqa32 zmm20, zmm16
+ vpermt2d zmm16, zmm27, zmm17
+ vpermt2d zmm20, zmm31, zmm17
+ vshufps zmm17, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm21, zmm17
+ vpermt2d zmm17, zmm27, zmm30
+ vpermt2d zmm21, zmm31, zmm30
+ vshufps zmm18, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm22, zmm18
+ vpermt2d zmm18, zmm27, zmm8
+ vpermt2d zmm22, zmm31, zmm8
+ vshufps zmm19, zmm9, zmm11, 221
+ vshufps zmm8, zmm13, zmm15, 221
+ vmovdqa32 zmm23, zmm19
+ vpermt2d zmm19, zmm27, zmm8
+ vpermt2d zmm23, zmm31, zmm8
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x40]
+ mov r13, qword ptr [rdi+0x48]
+ mov r14, qword ptr [rdi+0x50]
+ mov r15, qword ptr [rdi+0x58]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm8, zmm24, zmm25
+ vpunpckhqdq zmm9, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm10, zmm24, zmm25
+ vpunpckhqdq zmm11, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ mov r8, qword ptr [rdi+0x20]
+ mov r9, qword ptr [rdi+0x28]
+ mov r10, qword ptr [rdi+0x30]
+ mov r11, qword ptr [rdi+0x38]
+ mov r12, qword ptr [rdi+0x60]
+ mov r13, qword ptr [rdi+0x68]
+ mov r14, qword ptr [rdi+0x70]
+ mov r15, qword ptr [rdi+0x78]
+ vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm12, zmm24, zmm25
+ vpunpckhqdq zmm13, zmm24, zmm25
+ vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
+ vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
+ vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
+ vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
+ vpunpcklqdq zmm14, zmm24, zmm25
+ vpunpckhqdq zmm15, zmm24, zmm25
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r12+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r13+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r14+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ prefetcht0 [r15+rdx+0x80]
+ vshufps zmm24, zmm8, zmm10, 136
+ vshufps zmm30, zmm12, zmm14, 136
+ vmovdqa32 zmm28, zmm24
+ vpermt2d zmm24, zmm27, zmm30
+ vpermt2d zmm28, zmm31, zmm30
+ vshufps zmm25, zmm8, zmm10, 221
+ vshufps zmm30, zmm12, zmm14, 221
+ vmovdqa32 zmm29, zmm25
+ vpermt2d zmm25, zmm27, zmm30
+ vpermt2d zmm29, zmm31, zmm30
+ vshufps zmm26, zmm9, zmm11, 136
+ vshufps zmm8, zmm13, zmm15, 136
+ vmovdqa32 zmm30, zmm26
+ vpermt2d zmm26, zmm27, zmm8
+ vpermt2d zmm30, zmm31, zmm8
+ vshufps zmm8, zmm9, zmm11, 221
+ vshufps zmm10, zmm13, zmm15, 221
+ vpermi2d zmm27, zmm8, zmm10
+ vpermi2d zmm31, zmm8, zmm10
+ vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa32 zmm12, zmmword ptr [rsp]
+ vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40]
+ vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4]
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm24
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm23
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm17
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm29
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm22
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm27
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm21
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm30
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm20
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm21
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm16
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm28
+ vpaddd zmm1, zmm1, zmm25
+ vpaddd zmm2, zmm2, zmm31
+ vpaddd zmm3, zmm3, zmm30
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm26
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm23
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm16
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm18
+ vpaddd zmm1, zmm1, zmm19
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm25
+ vpaddd zmm1, zmm1, zmm27
+ vpaddd zmm2, zmm2, zmm24
+ vpaddd zmm3, zmm3, zmm31
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm28
+ vpaddd zmm3, zmm3, zmm17
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm29
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm18
+ vpaddd zmm3, zmm3, zmm20
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm19
+ vpaddd zmm1, zmm1, zmm26
+ vpaddd zmm2, zmm2, zmm22
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpaddd zmm0, zmm0, zmm27
+ vpaddd zmm1, zmm1, zmm21
+ vpaddd zmm2, zmm2, zmm17
+ vpaddd zmm3, zmm3, zmm24
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vprord zmm15, zmm15, 16
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 12
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vpaddd zmm0, zmm0, zmm31
+ vpaddd zmm1, zmm1, zmm16
+ vpaddd zmm2, zmm2, zmm25
+ vpaddd zmm3, zmm3, zmm22
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm1, zmm1, zmm5
+ vpaddd zmm2, zmm2, zmm6
+ vpaddd zmm3, zmm3, zmm7
+ vpxord zmm12, zmm12, zmm0
+ vpxord zmm13, zmm13, zmm1
+ vpxord zmm14, zmm14, zmm2
+ vpxord zmm15, zmm15, zmm3
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vprord zmm15, zmm15, 8
+ vpaddd zmm8, zmm8, zmm12
+ vpaddd zmm9, zmm9, zmm13
+ vpaddd zmm10, zmm10, zmm14
+ vpaddd zmm11, zmm11, zmm15
+ vpxord zmm4, zmm4, zmm8
+ vpxord zmm5, zmm5, zmm9
+ vpxord zmm6, zmm6, zmm10
+ vpxord zmm7, zmm7, zmm11
+ vprord zmm4, zmm4, 7
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vpaddd zmm0, zmm0, zmm30
+ vpaddd zmm1, zmm1, zmm18
+ vpaddd zmm2, zmm2, zmm19
+ vpaddd zmm3, zmm3, zmm23
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 16
+ vprord zmm12, zmm12, 16
+ vprord zmm13, zmm13, 16
+ vprord zmm14, zmm14, 16
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 12
+ vprord zmm6, zmm6, 12
+ vprord zmm7, zmm7, 12
+ vprord zmm4, zmm4, 12
+ vpaddd zmm0, zmm0, zmm26
+ vpaddd zmm1, zmm1, zmm28
+ vpaddd zmm2, zmm2, zmm20
+ vpaddd zmm3, zmm3, zmm29
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm1, zmm1, zmm6
+ vpaddd zmm2, zmm2, zmm7
+ vpaddd zmm3, zmm3, zmm4
+ vpxord zmm15, zmm15, zmm0
+ vpxord zmm12, zmm12, zmm1
+ vpxord zmm13, zmm13, zmm2
+ vpxord zmm14, zmm14, zmm3
+ vprord zmm15, zmm15, 8
+ vprord zmm12, zmm12, 8
+ vprord zmm13, zmm13, 8
+ vprord zmm14, zmm14, 8
+ vpaddd zmm10, zmm10, zmm15
+ vpaddd zmm11, zmm11, zmm12
+ vpaddd zmm8, zmm8, zmm13
+ vpaddd zmm9, zmm9, zmm14
+ vpxord zmm5, zmm5, zmm10
+ vpxord zmm6, zmm6, zmm11
+ vpxord zmm7, zmm7, zmm8
+ vpxord zmm4, zmm4, zmm9
+ vprord zmm5, zmm5, 7
+ vprord zmm6, zmm6, 7
+ vprord zmm7, zmm7, 7
+ vprord zmm4, zmm4, 7
+ vpxord zmm0, zmm0, zmm8
+ vpxord zmm1, zmm1, zmm9
+ vpxord zmm2, zmm2, zmm10
+ vpxord zmm3, zmm3, zmm11
+ vpxord zmm4, zmm4, zmm12
+ vpxord zmm5, zmm5, zmm13
+ vpxord zmm6, zmm6, zmm14
+ vpxord zmm7, zmm7, zmm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 9b
+ mov rbx, qword ptr [rbp+0x50]
+ vpunpckldq zmm16, zmm0, zmm1
+ vpunpckhdq zmm17, zmm0, zmm1
+ vpunpckldq zmm18, zmm2, zmm3
+ vpunpckhdq zmm19, zmm2, zmm3
+ vpunpckldq zmm20, zmm4, zmm5
+ vpunpckhdq zmm21, zmm4, zmm5
+ vpunpckldq zmm22, zmm6, zmm7
+ vpunpckhdq zmm23, zmm6, zmm7
+ vpunpcklqdq zmm0, zmm16, zmm18
+ vpunpckhqdq zmm1, zmm16, zmm18
+ vpunpcklqdq zmm2, zmm17, zmm19
+ vpunpckhqdq zmm3, zmm17, zmm19
+ vpunpcklqdq zmm4, zmm20, zmm22
+ vpunpckhqdq zmm5, zmm20, zmm22
+ vpunpcklqdq zmm6, zmm21, zmm23
+ vpunpckhqdq zmm7, zmm21, zmm23
+ vshufi32x4 zmm16, zmm0, zmm4, 0x88
+ vshufi32x4 zmm17, zmm1, zmm5, 0x88
+ vshufi32x4 zmm18, zmm2, zmm6, 0x88
+ vshufi32x4 zmm19, zmm3, zmm7, 0x88
+ vshufi32x4 zmm20, zmm0, zmm4, 0xDD
+ vshufi32x4 zmm21, zmm1, zmm5, 0xDD
+ vshufi32x4 zmm22, zmm2, zmm6, 0xDD
+ vshufi32x4 zmm23, zmm3, zmm7, 0xDD
+ vshufi32x4 zmm0, zmm16, zmm17, 0x88
+ vshufi32x4 zmm1, zmm18, zmm19, 0x88
+ vshufi32x4 zmm2, zmm20, zmm21, 0x88
+ vshufi32x4 zmm3, zmm22, zmm23, 0x88
+ vshufi32x4 zmm4, zmm16, zmm17, 0xDD
+ vshufi32x4 zmm5, zmm18, zmm19, 0xDD
+ vshufi32x4 zmm6, zmm20, zmm21, 0xDD
+ vshufi32x4 zmm7, zmm22, zmm23, 0xDD
+ vmovdqu32 zmmword ptr [rbx], zmm0
+ vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1
+ vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2
+ vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3
+ vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4
+ vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5
+ vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6
+ vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7
+ vmovdqa32 zmm0, zmmword ptr [rsp]
+ vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40]
+ vmovdqa32 zmm2, zmm0
+ vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16}
+ vpcmpltud k2, zmm2, zmm0
+ vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16}
+ vmovdqa32 zmmword ptr [rsp], zmm2
+ vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1
+ add rdi, 128
+ add rbx, 512
+ mov qword ptr [rbp+0x50], rbx
+ sub rsi, 16
+ cmp rsi, 16
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ vzeroupper
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 6
+3:
+ test esi, 0x8
+ je 3f
+ vpbroadcastd ymm0, dword ptr [rcx]
+ vpbroadcastd ymm1, dword ptr [rcx+0x4]
+ vpbroadcastd ymm2, dword ptr [rcx+0x8]
+ vpbroadcastd ymm3, dword ptr [rcx+0xC]
+ vpbroadcastd ymm4, dword ptr [rcx+0x10]
+ vpbroadcastd ymm5, dword ptr [rcx+0x14]
+ vpbroadcastd ymm6, dword ptr [rcx+0x18]
+ vpbroadcastd ymm7, dword ptr [rcx+0x1C]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov r12, qword ptr [rdi+0x20]
+ mov r13, qword ptr [rdi+0x28]
+ mov r14, qword ptr [rdi+0x30]
+ mov r15, qword ptr [rdi+0x38]
+ movzx eax, byte ptr [rbp+0x38]
+ movzx ebx, byte ptr [rbp+0x40]
+ or eax, ebx
+ xor edx, edx
+2:
+ movzx ebx, byte ptr [rbp+0x48]
+ or ebx, eax
+ add rdx, 64
+ cmp rdx, qword ptr [rsp+0x80]
+ cmove eax, ebx
+ mov dword ptr [rsp+0x88], eax
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x40]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x40]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x40]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm16, ymm12, ymm14, 136
+ vshufps ymm17, ymm12, ymm14, 221
+ vshufps ymm18, ymm13, ymm15, 136
+ vshufps ymm19, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x30]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x30]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm20, ymm12, ymm14, 136
+ vshufps ymm21, ymm12, ymm14, 221
+ vshufps ymm22, ymm13, ymm15, 136
+ vshufps ymm23, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x20]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x20]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x20]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm24, ymm12, ymm14, 136
+ vshufps ymm25, ymm12, ymm14, 221
+ vshufps ymm26, ymm13, ymm15, 136
+ vshufps ymm27, ymm13, ymm15, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01
+ vmovups xmm9, xmmword ptr [r9+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01
+ vunpcklpd ymm12, ymm8, ymm9
+ vunpckhpd ymm13, ymm8, ymm9
+ vmovups xmm10, xmmword ptr [r10+rdx-0x10]
+ vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01
+ vmovups xmm11, xmmword ptr [r11+rdx-0x10]
+ vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01
+ vunpcklpd ymm14, ymm10, ymm11
+ vunpckhpd ymm15, ymm10, ymm11
+ vshufps ymm28, ymm12, ymm14, 136
+ vshufps ymm29, ymm12, ymm14, 221
+ vshufps ymm30, ymm13, ymm15, 136
+ vshufps ymm31, ymm13, ymm15, 221
+ vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip]
+ vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip]
+ vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip]
+ vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip]
+ vmovdqa ymm12, ymmword ptr [rsp]
+ vmovdqa ymm13, ymmword ptr [rsp+0x40]
+ vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vpbroadcastd ymm15, dword ptr [rsp+0x88]
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm24
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm23
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm17
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm29
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm22
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm27
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm21
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm30
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm20
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm21
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm16
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm28
+ vpaddd ymm1, ymm1, ymm25
+ vpaddd ymm2, ymm2, ymm31
+ vpaddd ymm3, ymm3, ymm30
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm26
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm23
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm16
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm18
+ vpaddd ymm1, ymm1, ymm19
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm25
+ vpaddd ymm1, ymm1, ymm27
+ vpaddd ymm2, ymm2, ymm24
+ vpaddd ymm3, ymm3, ymm31
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm28
+ vpaddd ymm3, ymm3, ymm17
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm29
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm18
+ vpaddd ymm3, ymm3, ymm20
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm19
+ vpaddd ymm1, ymm1, ymm26
+ vpaddd ymm2, ymm2, ymm22
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpaddd ymm0, ymm0, ymm27
+ vpaddd ymm1, ymm1, ymm21
+ vpaddd ymm2, ymm2, ymm17
+ vpaddd ymm3, ymm3, ymm24
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vprord ymm15, ymm15, 16
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 12
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vpaddd ymm0, ymm0, ymm31
+ vpaddd ymm1, ymm1, ymm16
+ vpaddd ymm2, ymm2, ymm25
+ vpaddd ymm3, ymm3, ymm22
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm1, ymm1, ymm5
+ vpaddd ymm2, ymm2, ymm6
+ vpaddd ymm3, ymm3, ymm7
+ vpxord ymm12, ymm12, ymm0
+ vpxord ymm13, ymm13, ymm1
+ vpxord ymm14, ymm14, ymm2
+ vpxord ymm15, ymm15, ymm3
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vprord ymm15, ymm15, 8
+ vpaddd ymm8, ymm8, ymm12
+ vpaddd ymm9, ymm9, ymm13
+ vpaddd ymm10, ymm10, ymm14
+ vpaddd ymm11, ymm11, ymm15
+ vpxord ymm4, ymm4, ymm8
+ vpxord ymm5, ymm5, ymm9
+ vpxord ymm6, ymm6, ymm10
+ vpxord ymm7, ymm7, ymm11
+ vprord ymm4, ymm4, 7
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vpaddd ymm0, ymm0, ymm30
+ vpaddd ymm1, ymm1, ymm18
+ vpaddd ymm2, ymm2, ymm19
+ vpaddd ymm3, ymm3, ymm23
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 16
+ vprord ymm12, ymm12, 16
+ vprord ymm13, ymm13, 16
+ vprord ymm14, ymm14, 16
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 12
+ vprord ymm6, ymm6, 12
+ vprord ymm7, ymm7, 12
+ vprord ymm4, ymm4, 12
+ vpaddd ymm0, ymm0, ymm26
+ vpaddd ymm1, ymm1, ymm28
+ vpaddd ymm2, ymm2, ymm20
+ vpaddd ymm3, ymm3, ymm29
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm1, ymm1, ymm6
+ vpaddd ymm2, ymm2, ymm7
+ vpaddd ymm3, ymm3, ymm4
+ vpxord ymm15, ymm15, ymm0
+ vpxord ymm12, ymm12, ymm1
+ vpxord ymm13, ymm13, ymm2
+ vpxord ymm14, ymm14, ymm3
+ vprord ymm15, ymm15, 8
+ vprord ymm12, ymm12, 8
+ vprord ymm13, ymm13, 8
+ vprord ymm14, ymm14, 8
+ vpaddd ymm10, ymm10, ymm15
+ vpaddd ymm11, ymm11, ymm12
+ vpaddd ymm8, ymm8, ymm13
+ vpaddd ymm9, ymm9, ymm14
+ vpxord ymm5, ymm5, ymm10
+ vpxord ymm6, ymm6, ymm11
+ vpxord ymm7, ymm7, ymm8
+ vpxord ymm4, ymm4, ymm9
+ vprord ymm5, ymm5, 7
+ vprord ymm6, ymm6, 7
+ vprord ymm7, ymm7, 7
+ vprord ymm4, ymm4, 7
+ vpxor ymm0, ymm0, ymm8
+ vpxor ymm1, ymm1, ymm9
+ vpxor ymm2, ymm2, ymm10
+ vpxor ymm3, ymm3, ymm11
+ vpxor ymm4, ymm4, ymm12
+ vpxor ymm5, ymm5, ymm13
+ vpxor ymm6, ymm6, ymm14
+ vpxor ymm7, ymm7, ymm15
+ movzx eax, byte ptr [rbp+0x38]
+ jne 2b
+ mov rbx, qword ptr [rbp+0x50]
+ vunpcklps ymm8, ymm0, ymm1
+ vunpcklps ymm9, ymm2, ymm3
+ vunpckhps ymm10, ymm0, ymm1
+ vunpcklps ymm11, ymm4, ymm5
+ vunpcklps ymm0, ymm6, ymm7
+ vshufps ymm12, ymm8, ymm9, 78
+ vblendps ymm1, ymm8, ymm12, 0xCC
+ vshufps ymm8, ymm11, ymm0, 78
+ vunpckhps ymm13, ymm2, ymm3
+ vblendps ymm2, ymm11, ymm8, 0xCC
+ vblendps ymm3, ymm12, ymm9, 0xCC
+ vperm2f128 ymm12, ymm1, ymm2, 0x20
+ vmovups ymmword ptr [rbx], ymm12
+ vunpckhps ymm14, ymm4, ymm5
+ vblendps ymm4, ymm8, ymm0, 0xCC
+ vunpckhps ymm15, ymm6, ymm7
+ vperm2f128 ymm7, ymm3, ymm4, 0x20
+ vmovups ymmword ptr [rbx+0x20], ymm7
+ vshufps ymm5, ymm10, ymm13, 78
+ vblendps ymm6, ymm5, ymm13, 0xCC
+ vshufps ymm13, ymm14, ymm15, 78
+ vblendps ymm10, ymm10, ymm5, 0xCC
+ vblendps ymm14, ymm14, ymm13, 0xCC
+ vperm2f128 ymm8, ymm10, ymm14, 0x20
+ vmovups ymmword ptr [rbx+0x40], ymm8
+ vblendps ymm15, ymm13, ymm15, 0xCC
+ vperm2f128 ymm13, ymm6, ymm15, 0x20
+ vmovups ymmword ptr [rbx+0x60], ymm13
+ vperm2f128 ymm9, ymm1, ymm2, 0x31
+ vperm2f128 ymm11, ymm3, ymm4, 0x31
+ vmovups ymmword ptr [rbx+0x80], ymm9
+ vperm2f128 ymm14, ymm10, ymm14, 0x31
+ vperm2f128 ymm15, ymm6, ymm15, 0x31
+ vmovups ymmword ptr [rbx+0xA0], ymm11
+ vmovups ymmword ptr [rbx+0xC0], ymm14
+ vmovups ymmword ptr [rbx+0xE0], ymm15
+ vmovdqa ymm0, ymmword ptr [rsp]
+ vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20]
+ vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20]
+ vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20]
+ vmovdqa ymmword ptr [rsp], ymm0
+ vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2
+ add rbx, 256
+ mov qword ptr [rbp+0x50], rbx
+ add rdi, 64
+ sub rsi, 8
+3:
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, qword ptr [rsp+0x80]
+ movzx r13, byte ptr [rbp+0x38]
+ movzx r12, byte ptr [rbp+0x48]
+ test esi, 0x4
+ je 3f
+ vbroadcasti32x4 zmm0, xmmword ptr [rcx]
+ vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10]
+ vmovdqa xmm12, xmmword ptr [rsp]
+ vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10]
+ vpunpckldq xmm14, xmm12, xmm13
+ vpunpckhdq xmm15, xmm12, xmm13
+ vpermq ymm14, ymm14, 0xDC
+ vpermq ymm15, ymm15, 0xDC
+ vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
+ vinserti32x8 zmm13, zmm14, ymm15, 0x01
+ mov eax, 17476
+ kmovw k2, eax
+ vpblendmd zmm13 {k2}, zmm13, zmm12
+ vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ mov eax, 43690
+ kmovw k3, eax
+ mov eax, 34952
+ kmovw k4, eax
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vmovdqa32 zmm2, zmm15
+ vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4]
+ vpblendmd zmm3 {k4}, zmm13, zmm8
+ vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x30]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03
+ vshufps zmm4, zmm8, zmm9, 136
+ vshufps zmm5, zmm8, zmm9, 221
+ vmovups zmm8, zmmword ptr [r8+rdx-0x20]
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02
+ vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03
+ vmovups zmm9, zmmword ptr [r8+rdx-0x10]
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02
+ vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03
+ vshufps zmm6, zmm8, zmm9, 136
+ vshufps zmm7, zmm8, zmm9, 221
+ vpshufd zmm6, zmm6, 0x93
+ vpshufd zmm7, zmm7, 0x93
+ mov al, 7
+9:
+ vpaddd zmm0, zmm0, zmm4
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm5
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x93
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x39
+ vpaddd zmm0, zmm0, zmm6
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 16
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 12
+ vpaddd zmm0, zmm0, zmm7
+ vpaddd zmm0, zmm0, zmm1
+ vpxord zmm3, zmm3, zmm0
+ vprord zmm3, zmm3, 8
+ vpaddd zmm2, zmm2, zmm3
+ vpxord zmm1, zmm1, zmm2
+ vprord zmm1, zmm1, 7
+ vpshufd zmm0, zmm0, 0x39
+ vpshufd zmm3, zmm3, 0x4E
+ vpshufd zmm2, zmm2, 0x93
+ dec al
+ jz 9f
+ vshufps zmm8, zmm4, zmm5, 214
+ vpshufd zmm9, zmm4, 0x0F
+ vpshufd zmm4, zmm8, 0x39
+ vshufps zmm8, zmm6, zmm7, 250
+ vpblendmd zmm9 {k3}, zmm9, zmm8
+ vpunpcklqdq zmm8, zmm7, zmm5
+ vpblendmd zmm8 {k4}, zmm8, zmm6
+ vpshufd zmm8, zmm8, 0x78
+ vpunpckhdq zmm5, zmm5, zmm7
+ vpunpckldq zmm6, zmm6, zmm5
+ vpshufd zmm7, zmm6, 0x1E
+ vmovdqa32 zmm5, zmm9
+ vmovdqa32 zmm6, zmm8
+ jmp 9b
+9:
+ vpxord zmm0, zmm0, zmm2
+ vpxord zmm1, zmm1, zmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02
+ vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03
+ vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x40]
+ vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10]
+ vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x40], xmm2
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+3:
+ test esi, 0x2
+ je 3f
+ vbroadcasti128 ymm0, xmmword ptr [rcx]
+ vbroadcasti128 ymm1, xmmword ptr [rcx+0x10]
+ vmovd xmm13, dword ptr [rsp]
+ vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1
+ vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovd xmm14, dword ptr [rsp+0x4]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vinserti128 ymm13, ymm13, xmm14, 0x01
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ mov dword ptr [rsp+0x88], eax
+ vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip]
+ vpbroadcastd ymm8, dword ptr [rsp+0x88]
+ vpblendd ymm3, ymm13, ymm8, 0x88
+ vmovups ymm8, ymmword ptr [r8+rdx-0x40]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x30]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01
+ vshufps ymm4, ymm8, ymm9, 136
+ vshufps ymm5, ymm8, ymm9, 221
+ vmovups ymm8, ymmword ptr [r8+rdx-0x20]
+ vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01
+ vmovups ymm9, ymmword ptr [r8+rdx-0x10]
+ vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01
+ vshufps ymm6, ymm8, ymm9, 136
+ vshufps ymm7, ymm8, ymm9, 221
+ vpshufd ymm6, ymm6, 0x93
+ vpshufd ymm7, ymm7, 0x93
+ mov al, 7
+9:
+ vpaddd ymm0, ymm0, ymm4
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm5
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x93
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x39
+ vpaddd ymm0, ymm0, ymm6
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 16
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 12
+ vpaddd ymm0, ymm0, ymm7
+ vpaddd ymm0, ymm0, ymm1
+ vpxord ymm3, ymm3, ymm0
+ vprord ymm3, ymm3, 8
+ vpaddd ymm2, ymm2, ymm3
+ vpxord ymm1, ymm1, ymm2
+ vprord ymm1, ymm1, 7
+ vpshufd ymm0, ymm0, 0x39
+ vpshufd ymm3, ymm3, 0x4E
+ vpshufd ymm2, ymm2, 0x93
+ dec al
+ jz 9f
+ vshufps ymm8, ymm4, ymm5, 214
+ vpshufd ymm9, ymm4, 0x0F
+ vpshufd ymm4, ymm8, 0x39
+ vshufps ymm8, ymm6, ymm7, 250
+ vpblendd ymm9, ymm9, ymm8, 0xAA
+ vpunpcklqdq ymm8, ymm7, ymm5
+ vpblendd ymm8, ymm8, ymm6, 0x88
+ vpshufd ymm8, ymm8, 0x78
+ vpunpckhdq ymm5, ymm5, ymm7
+ vpunpckldq ymm6, ymm6, ymm5
+ vpshufd ymm7, ymm6, 0x1E
+ vmovdqa ymm5, ymm9
+ vmovdqa ymm6, ymm8
+ jmp 9b
+9:
+ vpxor ymm0, ymm0, ymm2
+ vpxor ymm1, ymm1, ymm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01
+ vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01
+ vmovdqa xmm0, xmmword ptr [rsp]
+ vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10]
+ vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8]
+ vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48]
+ vmovdqa xmmword ptr [rsp], xmm0
+ vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2
+ add rbx, 64
+ add rdi, 16
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ vmovdqu xmm0, xmmword ptr [rcx]
+ vmovdqu xmm1, xmmword ptr [rcx+0x10]
+ vmovd xmm14, dword ptr [rsp]
+ vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1
+ vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+.p2align 5
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ vpinsrd xmm3, xmm14, eax, 3
+ vmovdqa xmm2, xmm15
+ vmovups xmm8, xmmword ptr [r8+rdx-0x40]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x30]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ vmovups xmm8, xmmword ptr [r8+rdx-0x20]
+ vmovups xmm9, xmmword ptr [r8+rdx-0x10]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ mov al, 7
+9:
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ vmovdqu xmmword ptr [rbx], xmm0
+ vmovdqu xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+/*
+ * End of zfs_blake3_hash_many_avx512: the jmp just above is the last
+ * instruction before the next function label. Its ELF size is recorded
+ * here, not after the last function in the file, so that the symbol does
+ * not also span the two compress functions below.
+ */
+.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512
+
+/*
+ * zfs_blake3_compress_in_place_avx512
+ *
+ * Runs seven rounds of the BLAKE3 compression on one 64-byte block using
+ * 128-bit AVX-512VL instructions (vpxord/vprord on xmm registers) and
+ * overwrites the 32 bytes at [rdi] with the result.
+ *
+ * Inputs, as read by the code (they match the SysV AMD64 argument order):
+ *   rdi  32 bytes of state, read and then written (unaligned access)
+ *   rsi  64-byte message block (unaligned access)
+ *   dl   byte that becomes state word 14
+ *   rcx  64-bit value that becomes state words 12 and 13
+ *   r8b  byte that becomes state word 15
+ * NOTE(review): in the BLAKE3 reference implementation these are cv,
+ * block, block_len, counter and flags - confirm against the C prototype.
+ *
+ * Clobbers rax, rdx, xmm0-xmm9 and the flags. Uses no stack.
+ */
+.p2align 6
+zfs_blake3_compress_in_place_avx512:
+ _CET_ENDBR
+ /* rows 0 and 1 of the 4x4 word state come from [rdi] */
+ vmovdqu xmm0, xmmword ptr [rdi]
+ vmovdqu xmm1, xmmword ptr [rdi+0x10]
+ /* rdx = dl | (r8b << 32); the 32-bit movzx writes clear the upper halves */
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ /* row 3 = { rcx low, rcx high, dl, r8b } */
+ vmovq xmm3, rcx
+ vmovq xmm4, rdx
+ vpunpcklqdq xmm3, xmm3, xmm4
+ /* row 2 = the four IV words (aligned load, BLAKE3_IV is .p2align 6) */
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /* split the block: xmm4 = words 0,2,4,6  xmm5 = words 1,3,5,7 */
+ vmovups xmm8, xmmword ptr [rsi]
+ vmovups xmm9, xmmword ptr [rsi+0x10]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ /* xmm6 = words 8,10,12,14  xmm7 = words 9,11,13,15 */
+ vmovups xmm8, xmmword ptr [rsi+0x20]
+ vmovups xmm9, xmmword ptr [rsi+0x30]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ /* rotate one lane so they line up with row 0 after its 0x93 rotation */
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ /* al = rounds left; rax is dead after the add above */
+ mov al, 7
+9:
+ /* column step, first half: a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ /* column step, second half: same shape with rotations 8 and 7 */
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ /* rotate rows 0, 3, 2 by 1, 2, 3 lanes; row 1 stays, so lanes now hold diagonals */
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ /* diagonal step, first half (rotations 16 and 12) */
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ /* diagonal step, second half (rotations 8 and 7) */
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ /* undo the lane rotations: rows are back in column order */
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ /* leave after the seventh round, skipping the message shuffle below */
+ dec al
+ jz 9f
+ /* rearrange the message words in xmm4-xmm7 for the next round (xmm8/xmm9 are scratch) */
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ /* result = rows 0-1 XOR rows 2-3, written back over the input state */
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ vmovdqu xmmword ptr [rdi], xmm0
+ vmovdqu xmmword ptr [rdi+0x10], xmm1
+ ret
+.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512
+
+/*
+ * zfs_blake3_compress_xof_avx512
+ *
+ * Performs the same seven-round compression as
+ * zfs_blake3_compress_in_place_avx512, but leaves the 32 bytes at [rdi]
+ * unmodified and stores all four state rows (64 bytes) at [r9]:
+ *   out[ 0..31] = rows 0-1 XOR rows 2-3
+ *   out[32..63] = rows 2-3 XOR the 32 bytes at [rdi]
+ *
+ * Inputs, as read by the code (they match the SysV AMD64 argument order):
+ *   rdi  32 bytes of state, read only (unaligned access)
+ *   rsi  64-byte message block (unaligned access)
+ *   dl   byte that becomes state word 14
+ *   rcx  64-bit value that becomes state words 12 and 13
+ *   r8b  byte that becomes state word 15
+ *   r9   64-byte output buffer (unaligned access)
+ * NOTE(review): in the BLAKE3 reference implementation these are cv,
+ * block, block_len, counter, flags and out - confirm against the C
+ * prototype.
+ *
+ * Clobbers rax, rdx, xmm0-xmm9 and the flags. Uses no stack.
+ */
+.p2align 6
+zfs_blake3_compress_xof_avx512:
+ _CET_ENDBR
+ /* rows 0 and 1 of the 4x4 word state come from [rdi] */
+ vmovdqu xmm0, xmmword ptr [rdi]
+ vmovdqu xmm1, xmmword ptr [rdi+0x10]
+ /* rdx = dl | (r8b << 32); the 32-bit movzx writes clear the upper halves */
+ movzx eax, r8b
+ movzx edx, dl
+ shl rax, 32
+ add rdx, rax
+ /* row 3 = { rcx low, rcx high, dl, r8b } */
+ vmovq xmm3, rcx
+ vmovq xmm4, rdx
+ vpunpcklqdq xmm3, xmm3, xmm4
+ /* row 2 = the four IV words (aligned load, BLAKE3_IV is .p2align 6) */
+ vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /* split the block: xmm4 = words 0,2,4,6  xmm5 = words 1,3,5,7 */
+ vmovups xmm8, xmmword ptr [rsi]
+ vmovups xmm9, xmmword ptr [rsi+0x10]
+ vshufps xmm4, xmm8, xmm9, 136
+ vshufps xmm5, xmm8, xmm9, 221
+ /* xmm6 = words 8,10,12,14  xmm7 = words 9,11,13,15 */
+ vmovups xmm8, xmmword ptr [rsi+0x20]
+ vmovups xmm9, xmmword ptr [rsi+0x30]
+ vshufps xmm6, xmm8, xmm9, 136
+ vshufps xmm7, xmm8, xmm9, 221
+ /* rotate one lane so they line up with row 0 after its 0x93 rotation */
+ vpshufd xmm6, xmm6, 0x93
+ vpshufd xmm7, xmm7, 0x93
+ /* al = rounds left; rax is dead after the add above */
+ mov al, 7
+9:
+ /* column step, first half: a += m + b; d = ror(d ^ a, 16); c += d; b = ror(b ^ c, 12) */
+ vpaddd xmm0, xmm0, xmm4
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ /* column step, second half: same shape with rotations 8 and 7 */
+ vpaddd xmm0, xmm0, xmm5
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ /* rotate rows 0, 3, 2 by 1, 2, 3 lanes; row 1 stays, so lanes now hold diagonals */
+ vpshufd xmm0, xmm0, 0x93
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x39
+ /* diagonal step, first half (rotations 16 and 12) */
+ vpaddd xmm0, xmm0, xmm6
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 16
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 12
+ /* diagonal step, second half (rotations 8 and 7) */
+ vpaddd xmm0, xmm0, xmm7
+ vpaddd xmm0, xmm0, xmm1
+ vpxord xmm3, xmm3, xmm0
+ vprord xmm3, xmm3, 8
+ vpaddd xmm2, xmm2, xmm3
+ vpxord xmm1, xmm1, xmm2
+ vprord xmm1, xmm1, 7
+ /* undo the lane rotations: rows are back in column order */
+ vpshufd xmm0, xmm0, 0x39
+ vpshufd xmm3, xmm3, 0x4E
+ vpshufd xmm2, xmm2, 0x93
+ /* leave after the seventh round, skipping the message shuffle below */
+ dec al
+ jz 9f
+ /* rearrange the message words in xmm4-xmm7 for the next round (xmm8/xmm9 are scratch) */
+ vshufps xmm8, xmm4, xmm5, 214
+ vpshufd xmm9, xmm4, 0x0F
+ vpshufd xmm4, xmm8, 0x39
+ vshufps xmm8, xmm6, xmm7, 250
+ vpblendd xmm9, xmm9, xmm8, 0xAA
+ vpunpcklqdq xmm8, xmm7, xmm5
+ vpblendd xmm8, xmm8, xmm6, 0x88
+ vpshufd xmm8, xmm8, 0x78
+ vpunpckhdq xmm5, xmm5, xmm7
+ vpunpckldq xmm6, xmm6, xmm5
+ vpshufd xmm7, xmm6, 0x1E
+ vmovdqa xmm5, xmm9
+ vmovdqa xmm6, xmm8
+ jmp 9b
+9:
+ /* first 32 output bytes: rows 0-1 XOR rows 2-3 */
+ vpxor xmm0, xmm0, xmm2
+ vpxor xmm1, xmm1, xmm3
+ /* last 32 output bytes: rows 2-3 XOR the still-unmodified input state */
+ vpxor xmm2, xmm2, [rdi]
+ vpxor xmm3, xmm3, [rdi+0x10]
+ vmovdqu xmmword ptr [r9], xmm0
+ vmovdqu xmmword ptr [r9+0x10], xmm1
+ vmovdqu xmmword ptr [r9+0x20], xmm2
+ vmovdqu xmmword ptr [r9+0x30], xmm3
+ ret
+.size zfs_blake3_compress_xof_avx512, . - zfs_blake3_compress_xof_avx512
+
+#ifdef __APPLE__ /* constants go to .static_data when __APPLE__ is defined, otherwise to .rodata */
+.static_data
+#else
+.section .rodata
+#endif
+
+.p2align 6 /* start the tables on a 64-byte boundary */
+INDEX0: /* 16 dwords with values in 0..31; presumably two-source permute indices - the use is outside this excerpt, confirm in hash_many */
+ .long 0, 1, 2, 3, 16, 17, 18, 19
+ .long 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1: /* companion table to INDEX0 covering the remaining values in 0..31 */
+ .long 4, 5, 6, 7, 20, 21, 22, 23
+ .long 12, 13, 14, 15, 28, 29, 30, 31
+ADD0: /* the dwords 0..15 in order; presumably per-lane offsets - use not visible here, confirm */
+ .long 0, 1, 2, 3, 4, 5, 6, 7
+ .long 8, 9, 10, 11, 12, 13, 14, 15
+ADD1: .long 1 /* single dword constant 1 */
+
+ADD16: .long 16 /* single dword constant 16 */
+BLAKE3_BLOCK_LEN: /* read as one dword (vpinsrd ... [BLAKE3_BLOCK_LEN+rip]) by the hash_many tail paths */
+ .long 64
+.p2align 6 /* BLAKE3_IV is fetched with aligned 16-byte loads (vmovdqa/vmovaps), so it must stay at least 16-byte aligned */
+BLAKE3_IV: /* four consecutive dwords, loaded together as row 2 of the state */
+BLAKE3_IV_0: /* per-word labels into the same 16 bytes */
+ .long 0x6A09E667
+BLAKE3_IV_1:
+ .long 0xBB67AE85
+BLAKE3_IV_2:
+ .long 0x3C6EF372
+BLAKE3_IV_3:
+ .long 0xA54FF53A
+
+#endif /* HAVE_AVX512 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_sse2.S b/module/icp/asm-x86_64/blake3/blake3_sse2.S
new file mode 100644
index 000000000..39d23ee23
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_sse2.S
@@ -0,0 +1,2323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#if defined(HAVE_SSE2)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* only look for cet.h on ELF builds with CET enabled and a preprocessor that has __has_include */
+#if __has_include(<cet.h>)
+#include <cet.h> /* expected to supply _CET_ENDBR - see the fallback just below */
+#endif
+#endif
+
+#if !defined(_CET_ENDBR) /* no definition was picked up above */
+#define _CET_ENDBR /* empty fallback: the _CET_ENDBR at each function entry then expands to nothing */
+#endif
+
+.intel_syntax noprefix /* Intel operand order (dst, src) and bare register names from here on */
+.global zfs_blake3_hash_many_sse2 /* the three entry points exported by this file */
+.global zfs_blake3_compress_in_place_sse2
+.global zfs_blake3_compress_xof_sse2
+
+.text
+.type zfs_blake3_hash_many_sse2,@function /* mark the exported symbols as functions in the ELF symbol table */
+.type zfs_blake3_compress_in_place_sse2,@function
+.type zfs_blake3_compress_xof_sse2,@function
+
+ .p2align 6
+zfs_blake3_hash_many_sse2:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ pshuflw xmm15, xmm15, 0xB1
+ pshufhw xmm15, xmm15, 0xB1
+ pshuflw xmm12, xmm12, 0xB1
+ pshufhw xmm12, xmm12, 0xB1
+ pshuflw xmm13, xmm13, 0xB1
+ pshufhw xmm13, xmm13, 0xB1
+ pshuflw xmm14, xmm14, 0xB1
+ pshufhw xmm14, xmm14, 0xB1
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmm15
+ psrld xmm15, 8
+ pslld xmm8, 24
+ pxor xmm15, xmm8
+ movdqa xmm8, xmm12
+ psrld xmm12, 8
+ pslld xmm8, 24
+ pxor xmm12, xmm8
+ movdqa xmm8, xmm13
+ psrld xmm13, 8
+ pslld xmm8, 24
+ pxor xmm13, xmm8
+ movdqa xmm8, xmm14
+ psrld xmm14, 8
+ pslld xmm8, 24
+ pxor xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ movd xmm13, dword ptr [rsp+0x124]
+ punpckldq xmm14, xmm13
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ shl rax, 0x20
+ or rax, 0x40
+ movq xmm3, rax
+ movdqa xmmword ptr [rsp+0x20], xmm3
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ pshuflw xmm11, xmm11, 0xB1
+ pshufhw xmm11, xmm11, 0xB1
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movdqa xmm13, xmm3
+ psrld xmm3, 8
+ pslld xmm13, 24
+ pxor xmm3, xmm13
+ movdqa xmm13, xmm11
+ psrld xmm11, 8
+ pslld xmm13, 24
+ pxor xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm13, xmm12
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ movdqa xmm13, xmm6
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm12, xmm13
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm6, xmm5
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ movdqa xmmword ptr [rsp+0x30], xmm2
+ movdqa xmm2, xmm14
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm5, xmm2
+ movdqa xmm2, xmmword ptr [rsp+0x30]
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ mov eax, dword ptr [rsp+0x130]
+ neg eax
+ mov r10d, dword ptr [rsp+0x110+8*rax]
+ mov r11d, dword ptr [rsp+0x120+8*rax]
+ mov dword ptr [rsp+0x110], r10d
+ mov dword ptr [rsp+0x120], r11d
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ movd xmm14, dword ptr [rsp+0x120]
+ punpckldq xmm13, xmm14
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ shl rax, 32
+ or rax, 64
+ movq xmm12, rax
+ movdqa xmm3, xmm13
+ punpcklqdq xmm3, xmm12
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+
+.p2align 6
+zfs_blake3_compress_in_place_sse2: /* One BLAKE3 compression (7 rounds) using SSE2 instructions only. In: rdi -> 32 bytes read as state rows 0-1 and overwritten with the result; rsi -> 64-byte message block; rcx, rdx, r8 -> state row 3. Leaf routine: no stack use; touches rax, rdx, r8, xmm0-xmm11, xmm14 and flags. */
+ _CET_ENDBR /* CET landing-pad macro; presumably expands to nothing when <cet.h> is not in use, as in the sibling sse41 file - confirm */
+ movups xmm0, xmmword ptr [rdi] /* xmm0 = state row 0 = bytes 0..15 at rdi (unaligned load) */
+ movups xmm1, xmmword ptr [rdi+0x10] /* xmm1 = state row 1 = bytes 16..31 at rdi */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = state row 2 = the four BLAKE3_IV words */
+ shl r8, 32 /* NOTE(review): r8 and rdx are consumed at full 64-bit width here (zfs_blake3_compress_xof_sse2 does movzx on r8b and dl first); assumes the caller passed both already zero-extended - TODO confirm */
+ add rdx, r8 /* rdx = rdx + (r8 << 32) */
+ movq xmm3, rcx /* low qword of row 3 = rcx */
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4 /* xmm3 = state row 3 = { rcx, rdx }; in the BLAKE3 state layout that is counter lo/hi, block length, flags - confirm against the C prototype */
+ movups xmm4, xmmword ptr [rsi] /* message words m0..m3 */
+ movups xmm5, xmmword ptr [rsi+0x10] /* m4..m7 */
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* xmm4 = m0, m2, m4, m6 (even words) */
+ shufps xmm8, xmm5, 221 /* xmm8 = m1, m3, m5, m7 (odd words) */
+ movaps xmm5, xmm8 /* xmm5 = m1, m3, m5, m7 */
+ movups xmm6, xmmword ptr [rsi+0x20] /* m8..m11 */
+ movups xmm7, xmmword ptr [rsi+0x30] /* m12..m15 */
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 /* m8, m10, m12, m14 */
+ pshufd xmm6, xmm6, 0x93 /* lane-rotate: xmm6 = m14, m8, m10, m12 */
+ shufps xmm8, xmm7, 221 /* m9, m11, m13, m15 */
+ pshufd xmm7, xmm8, 0x93 /* lane-rotate: xmm7 = m15, m9, m11, m13 */
+ mov al, 7 /* al = rounds remaining */
+9: /* top of one round; xmm4..xmm7 hold the message words in the order this round consumes them */
+ paddd xmm0, xmm4 /* column step, first half: row0 += message words in xmm4 */
+ paddd xmm0, xmm1 /* row0 += row1 */
+ pxor xmm3, xmm0 /* row3 ^= row0 */
+ pshuflw xmm3, xmm3, 0xB1 /* swap the 16-bit halves of each dword (low qword) ... */
+ pshufhw xmm3, xmm3, 0xB1 /* ... and high qword: every dword of row3 rotated by 16 */
+ paddd xmm2, xmm3 /* row2 += row3 */
+ pxor xmm1, xmm2 /* row1 ^= row2 */
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* row1 = each dword rotated right by 12 */
+ paddd xmm0, xmm5 /* column step, second half: row0 += message words in xmm5 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14 /* row3 = each dword rotated right by 8 (the two shifted halves do not overlap, so pxor acts as por) */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* row1 = each dword rotated right by 7 */
+ pshufd xmm0, xmm0, 0x93 /* lane-rotate rows 0, 3 and 2 (row 1 stays put) so the diagonals line up as columns */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* diagonal step, first half: same sequence as above with message words in xmm6 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1 /* row3 rotated by 16 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* row1 rotated right by 12 */
+ paddd xmm0, xmm7 /* diagonal step, second half: message words in xmm7 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14 /* row3 rotated right by 8 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* row1 rotated right by 7 */
+ pshufd xmm0, xmm0, 0x39 /* inverse lane rotations: rows back in column order */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* seventh round finished: skip the permutation and finalize */
+ movdqa xmm8, xmm4 /* more rounds to go: rebuild xmm4..xmm7 with the message words permuted for the next round */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39 /* next-round xmm4 */
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* pand/pand/por = dword blend: lanes 0,2 from xmm9, lanes 1,3 from xmm8 (mask names suggest a stand-in for SSE4.1 pblendw) */
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8 /* xmm9 = next-round xmm5 */
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* blend: lanes 0..2 from xmm8, lane 3 from xmm10 */
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78 /* xmm8 = next-round xmm6 */
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E /* next-round xmm7 */
+ movdqa xmm5, xmm9 /* commit the new xmm5 only now: the old value was still needed above */
+ movdqa xmm6, xmm8 /* commit the new xmm6 */
+ jmp 9b
+9: /* finalization */
+ pxor xmm0, xmm2 /* row0 ^= row2 */
+ pxor xmm1, xmm3 /* row1 ^= row3 */
+ movups xmmword ptr [rdi], xmm0 /* store the 32-byte result over the input at rdi */
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+
+.p2align 6
+zfs_blake3_compress_xof_sse2: /* One BLAKE3 compression (7 rounds) using SSE2 instructions only, producing the full 64-byte output. In: rdi -> 32 input bytes (state rows 0-1, left unmodified); rsi -> 64-byte message block; rcx, dl, r8b -> state row 3; r9 -> 64-byte output buffer. Leaf routine: no stack use; touches rax, rdx, xmm0-xmm11, xmm14 and flags. */
+ _CET_ENDBR /* CET landing-pad macro; presumably expands to nothing when <cet.h> is not in use, as in the sibling sse41 file - confirm */
+ movups xmm0, xmmword ptr [rdi] /* xmm0 = state row 0 = bytes 0..15 at rdi (unaligned load) */
+ movups xmm1, xmmword ptr [rdi+0x10] /* xmm1 = state row 1 = bytes 16..31 at rdi */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] /* xmm2 = state row 2 = the four BLAKE3_IV words */
+ movzx eax, r8b /* rax = low byte of r8, zero-extended */
+ movzx edx, dl /* rdx = low byte of rdx, zero-extended */
+ shl rax, 32
+ add rdx, rax /* rdx = dl + (r8b << 32) */
+ movq xmm3, rcx /* low qword of row 3 = rcx */
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4 /* xmm3 = state row 3 = { rcx, dl, r8b }; in the BLAKE3 state layout that is counter lo/hi, block length, flags - confirm against the C prototype */
+ movups xmm4, xmmword ptr [rsi] /* message words m0..m3 */
+ movups xmm5, xmmword ptr [rsi+0x10] /* m4..m7 */
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136 /* xmm4 = m0, m2, m4, m6 (even words) */
+ shufps xmm8, xmm5, 221 /* xmm8 = m1, m3, m5, m7 (odd words) */
+ movaps xmm5, xmm8 /* xmm5 = m1, m3, m5, m7 */
+ movups xmm6, xmmword ptr [rsi+0x20] /* m8..m11 */
+ movups xmm7, xmmword ptr [rsi+0x30] /* m12..m15 */
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136 /* m8, m10, m12, m14 */
+ pshufd xmm6, xmm6, 0x93 /* lane-rotate: xmm6 = m14, m8, m10, m12 */
+ shufps xmm8, xmm7, 221 /* m9, m11, m13, m15 */
+ pshufd xmm7, xmm8, 0x93 /* lane-rotate: xmm7 = m15, m9, m11, m13 */
+ mov al, 7 /* al = rounds remaining (rax is free again: rdx already holds the combined value) */
+9: /* top of one round; xmm4..xmm7 hold the message words in the order this round consumes them */
+ paddd xmm0, xmm4 /* column step, first half: row0 += message words in xmm4 */
+ paddd xmm0, xmm1 /* row0 += row1 */
+ pxor xmm3, xmm0 /* row3 ^= row0 */
+ pshuflw xmm3, xmm3, 0xB1 /* swap the 16-bit halves of each dword (low qword) ... */
+ pshufhw xmm3, xmm3, 0xB1 /* ... and high qword: every dword of row3 rotated by 16 */
+ paddd xmm2, xmm3 /* row2 += row3 */
+ pxor xmm1, xmm2 /* row1 ^= row2 */
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* row1 = each dword rotated right by 12 */
+ paddd xmm0, xmm5 /* column step, second half: row0 += message words in xmm5 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14 /* row3 = each dword rotated right by 8 (the two shifted halves do not overlap, so pxor acts as por) */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* row1 = each dword rotated right by 7 */
+ pshufd xmm0, xmm0, 0x93 /* lane-rotate rows 0, 3 and 2 (row 1 stays put) so the diagonals line up as columns */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6 /* diagonal step, first half: same sequence as above with message words in xmm6 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshuflw xmm3, xmm3, 0xB1
+ pshufhw xmm3, xmm3, 0xB1 /* row3 rotated by 16 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11 /* row1 rotated right by 12 */
+ paddd xmm0, xmm7 /* diagonal step, second half: message words in xmm7 */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ movdqa xmm14, xmm3
+ psrld xmm3, 8
+ pslld xmm14, 24
+ pxor xmm3, xmm14 /* row3 rotated right by 8 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11 /* row1 rotated right by 7 */
+ pshufd xmm0, xmm0, 0x39 /* inverse lane rotations: rows back in column order */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f /* seventh round finished: skip the permutation and finalize */
+ movdqa xmm8, xmm4 /* more rounds to go: rebuild xmm4..xmm7 with the message words permuted for the next round */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39 /* next-round xmm4 */
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] /* pand/pand/por = dword blend: lanes 0,2 from xmm9, lanes 1,3 from xmm8 (mask names suggest a stand-in for SSE4.1 pblendw) */
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
+ por xmm9, xmm8 /* xmm9 = next-round xmm5 */
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ movdqa xmm10, xmm6
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] /* blend: lanes 0..2 from xmm8, lane 3 from xmm10 */
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+ por xmm8, xmm10
+ pshufd xmm8, xmm8, 0x78 /* xmm8 = next-round xmm6 */
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E /* next-round xmm7 */
+ movdqa xmm5, xmm9 /* commit the new xmm5 only now: the old value was still needed above */
+ movdqa xmm6, xmm8 /* commit the new xmm6 */
+ jmp 9b
+9: /* finalization: 64 bytes of output */
+ movdqu xmm4, xmmword ptr [rdi] /* reload the original input bytes 0..15 from rdi (xmm4/xmm5 no longer needed for message words) */
+ movdqu xmm5, xmmword ptr [rdi+0x10] /* original input bytes 16..31 */
+ pxor xmm0, xmm2 /* out[0..15] = row0 ^ row2 (must precede the update of xmm2 below) */
+ pxor xmm1, xmm3 /* out[16..31] = row1 ^ row3 (must precede the update of xmm3 below) */
+ pxor xmm2, xmm4 /* out[32..47] = row2 ^ input[0..15] */
+ pxor xmm3, xmm5 /* out[48..63] = row3 ^ input[16..31] */
+ movups xmmword ptr [r9], xmm0 /* unaligned stores to the output buffer at r9; nothing is written to rdi */
+ movups xmmword ptr [r9+0x10], xmm1
+ movups xmmword ptr [r9+0x20], xmm2
+ movups xmmword ptr [r9+0x30], xmm3
+ ret
+
+.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2 /* NOTE(review): all three .size expressions are evaluated here, after the last ret, so each symbol size runs from its label to this point rather than to the end of its own function - confirm this is intended */
+.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2 /* as written, this size also covers all of zfs_blake3_compress_xof_sse2 */
+.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2 /* the only one of the three that ends at its own ret */
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6
+BLAKE3_IV: /* 16 bytes loaded with movaps as state row 2 by the compress routines. The .p2align 6 just above gives 64-byte alignment, and every table below is 16 bytes, so each label stays 16-byte aligned as SSE memory operands require. */
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+ADD0: /* dwords 0, 1, 2, 3 - presumably per-lane counter offsets for the 4-way hash_many path; the use is outside this excerpt, TODO confirm */
+ .long 0, 1, 2, 3
+ADD1: /* dword 4 in every lane - presumably the per-iteration counter step for four lanes; TODO confirm */
+ .long 4, 4, 4, 4
+BLAKE3_IV_0: /* first BLAKE3_IV word repeated in all four lanes */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1: /* second BLAKE3_IV word repeated in all four lanes */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2: /* third BLAKE3_IV word repeated in all four lanes */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3: /* fourth BLAKE3_IV word repeated in all four lanes */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN: /* 64 in every lane - presumably the block length word for the 4-way path; TODO confirm */
+ .long 64, 64, 64, 64
+CMP_MSB_MASK: /* sign bit of each dword; hash_many xors it into both operands before pcmpgtd so the signed compare orders them as unsigned values (carry detection on the counter low words) */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+PBLENDW_0x33_MASK: /* keeps dwords 0 and 2; used with pand/por in the message permutation as a dword blend */
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xCC_MASK: /* keeps dwords 1 and 3; complement of PBLENDW_0x33_MASK */
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
+PBLENDW_0x3F_MASK: /* keeps dwords 0..2 */
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
+PBLENDW_0xC0_MASK: /* keeps dword 3; complement of PBLENDW_0x3F_MASK */
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
+
+#endif /* HAVE_SSE2 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_sse41.S b/module/icp/asm-x86_64/blake3/blake3_sse41.S
new file mode 100644
index 000000000..1c40236f0
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_sse41.S
@@ -0,0 +1,2058 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ */
+
+#if defined(HAVE_SSE4_1)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* only attempt the include for ELF builds with __CET__ set and a preprocessor that has __has_include */
+#if __has_include(<cet.h>) /* skip silently when the toolchain does not ship cet.h */
+#include <cet.h> /* expected to supply _CET_ENDBR (see the fallback below) */
+#endif
+#endif
+
+#if !defined(_CET_ENDBR) /* fallback when nothing above defined the macro */
+#define _CET_ENDBR /* empty expansion, so the _CET_ENDBR line at each function entry emits nothing */
+#endif
+
+.intel_syntax noprefix /* rest of the file uses Intel operand order (dst, src) and bare register names */
+.global zfs_blake3_compress_in_place_sse41 /* export the three SSE4.1 entry points */
+.global zfs_blake3_compress_xof_sse41
+.global zfs_blake3_hash_many_sse41
+
+.text
+.type zfs_blake3_hash_many_sse41,@function /* mark each exported symbol as a function in the symbol table */
+.type zfs_blake3_compress_in_place_sse41,@function
+.type zfs_blake3_compress_xof_sse41,@function
+
+.p2align 6
+zfs_blake3_hash_many_sse41:
+ _CET_ENDBR
+ push r15
+ push r14
+ push r13
+ push r12
+ push rbx
+ push rbp
+ mov rbp, rsp
+ sub rsp, 360
+ and rsp, 0xFFFFFFFFFFFFFFC0
+ neg r9d
+ movd xmm0, r9d
+ pshufd xmm0, xmm0, 0x00
+ movdqa xmmword ptr [rsp+0x130], xmm0
+ movdqa xmm1, xmm0
+ pand xmm1, xmmword ptr [ADD0+rip]
+ pand xmm0, xmmword ptr [ADD1+rip]
+ movdqa xmmword ptr [rsp+0x150], xmm0
+ movd xmm0, r8d
+ pshufd xmm0, xmm0, 0x00
+ paddd xmm0, xmm1
+ movdqa xmmword ptr [rsp+0x110], xmm0
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm1, xmm0
+ shr r8, 32
+ movd xmm2, r8d
+ pshufd xmm2, xmm2, 0x00
+ psubd xmm2, xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ mov rbx, qword ptr [rbp+0x50]
+ mov r15, rdx
+ shl r15, 6
+ movzx r13d, byte ptr [rbp+0x38]
+ movzx r12d, byte ptr [rbp+0x48]
+ cmp rsi, 4
+ jc 3f
+2:
+ movdqu xmm3, xmmword ptr [rcx]
+ pshufd xmm0, xmm3, 0x00
+ pshufd xmm1, xmm3, 0x55
+ pshufd xmm2, xmm3, 0xAA
+ pshufd xmm3, xmm3, 0xFF
+ movdqu xmm7, xmmword ptr [rcx+0x10]
+ pshufd xmm4, xmm7, 0x00
+ pshufd xmm5, xmm7, 0x55
+ pshufd xmm6, xmm7, 0xAA
+ pshufd xmm7, xmm7, 0xFF
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ mov r10, qword ptr [rdi+0x10]
+ mov r11, qword ptr [rdi+0x18]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+9:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp], xmm8
+ movdqa xmmword ptr [rsp+0x10], xmm9
+ movdqa xmmword ptr [rsp+0x20], xmm12
+ movdqa xmmword ptr [rsp+0x30], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x40], xmm8
+ movdqa xmmword ptr [rsp+0x50], xmm9
+ movdqa xmmword ptr [rsp+0x60], xmm12
+ movdqa xmmword ptr [rsp+0x70], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0x80], xmm8
+ movdqa xmmword ptr [rsp+0x90], xmm9
+ movdqa xmmword ptr [rsp+0xA0], xmm12
+ movdqa xmmword ptr [rsp+0xB0], xmm13
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
+ movdqa xmm12, xmm8
+ punpckldq xmm8, xmm9
+ punpckhdq xmm12, xmm9
+ movdqa xmm14, xmm10
+ punpckldq xmm10, xmm11
+ punpckhdq xmm14, xmm11
+ movdqa xmm9, xmm8
+ punpcklqdq xmm8, xmm10
+ punpckhqdq xmm9, xmm10
+ movdqa xmm13, xmm12
+ punpcklqdq xmm12, xmm14
+ punpckhqdq xmm13, xmm14
+ movdqa xmmword ptr [rsp+0xC0], xmm8
+ movdqa xmmword ptr [rsp+0xD0], xmm9
+ movdqa xmmword ptr [rsp+0xE0], xmm12
+ movdqa xmmword ptr [rsp+0xF0], xmm13
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
+ movdqa xmm12, xmmword ptr [rsp+0x110]
+ movdqa xmm13, xmmword ptr [rsp+0x120]
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
+ movd xmm15, eax
+ pshufd xmm15, xmm15, 0x00
+ prefetcht0 [r8+rdx+0x80]
+ prefetcht0 [r9+rdx+0x80]
+ prefetcht0 [r10+rdx+0x80]
+ prefetcht0 [r11+rdx+0x80]
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x80]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x70]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x10]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0xD0]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x60]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xB0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x50]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0xE0]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x40]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x50]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xC0]
+ paddd xmm1, xmmword ptr [rsp+0x90]
+ paddd xmm2, xmmword ptr [rsp+0xF0]
+ paddd xmm3, xmmword ptr [rsp+0xE0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0xA0]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x70]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x20]
+ paddd xmm1, xmmword ptr [rsp+0x30]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x90]
+ paddd xmm1, xmmword ptr [rsp+0xB0]
+ paddd xmm2, xmmword ptr [rsp+0x80]
+ paddd xmm3, xmmword ptr [rsp+0xF0]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0xC0]
+ paddd xmm3, xmmword ptr [rsp+0x10]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xD0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x20]
+ paddd xmm3, xmmword ptr [rsp+0x40]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0x30]
+ paddd xmm1, xmmword ptr [rsp+0xA0]
+ paddd xmm2, xmmword ptr [rsp+0x60]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xB0]
+ paddd xmm1, xmmword ptr [rsp+0x50]
+ paddd xmm2, xmmword ptr [rsp+0x10]
+ paddd xmm3, xmmword ptr [rsp+0x80]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xF0]
+ paddd xmm1, xmmword ptr [rsp]
+ paddd xmm2, xmmword ptr [rsp+0x90]
+ paddd xmm3, xmmword ptr [rsp+0x60]
+ paddd xmm0, xmm4
+ paddd xmm1, xmm5
+ paddd xmm2, xmm6
+ paddd xmm3, xmm7
+ pxor xmm12, xmm0
+ pxor xmm13, xmm1
+ pxor xmm14, xmm2
+ pxor xmm15, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ pshufb xmm15, xmm8
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm12
+ paddd xmm9, xmm13
+ paddd xmm10, xmm14
+ paddd xmm11, xmm15
+ pxor xmm4, xmm8
+ pxor xmm5, xmm9
+ pxor xmm6, xmm10
+ pxor xmm7, xmm11
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xE0]
+ paddd xmm1, xmmword ptr [rsp+0x20]
+ paddd xmm2, xmmword ptr [rsp+0x30]
+ paddd xmm3, xmmword ptr [rsp+0x70]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT16+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ movdqa xmmword ptr [rsp+0x100], xmm8
+ movdqa xmm8, xmm5
+ psrld xmm8, 12
+ pslld xmm5, 20
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 12
+ pslld xmm6, 20
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 12
+ pslld xmm7, 20
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 12
+ pslld xmm4, 20
+ por xmm4, xmm8
+ paddd xmm0, xmmword ptr [rsp+0xA0]
+ paddd xmm1, xmmword ptr [rsp+0xC0]
+ paddd xmm2, xmmword ptr [rsp+0x40]
+ paddd xmm3, xmmword ptr [rsp+0xD0]
+ paddd xmm0, xmm5
+ paddd xmm1, xmm6
+ paddd xmm2, xmm7
+ paddd xmm3, xmm4
+ pxor xmm15, xmm0
+ pxor xmm12, xmm1
+ pxor xmm13, xmm2
+ pxor xmm14, xmm3
+ movdqa xmm8, xmmword ptr [ROT8+rip]
+ pshufb xmm15, xmm8
+ pshufb xmm12, xmm8
+ pshufb xmm13, xmm8
+ pshufb xmm14, xmm8
+ paddd xmm10, xmm15
+ paddd xmm11, xmm12
+ movdqa xmm8, xmmword ptr [rsp+0x100]
+ paddd xmm8, xmm13
+ paddd xmm9, xmm14
+ pxor xmm5, xmm10
+ pxor xmm6, xmm11
+ pxor xmm7, xmm8
+ pxor xmm4, xmm9
+ pxor xmm0, xmm8
+ pxor xmm1, xmm9
+ pxor xmm2, xmm10
+ pxor xmm3, xmm11
+ movdqa xmm8, xmm5
+ psrld xmm8, 7
+ pslld xmm5, 25
+ por xmm5, xmm8
+ movdqa xmm8, xmm6
+ psrld xmm8, 7
+ pslld xmm6, 25
+ por xmm6, xmm8
+ movdqa xmm8, xmm7
+ psrld xmm8, 7
+ pslld xmm7, 25
+ por xmm7, xmm8
+ movdqa xmm8, xmm4
+ psrld xmm8, 7
+ pslld xmm4, 25
+ por xmm4, xmm8
+ pxor xmm4, xmm12
+ pxor xmm5, xmm13
+ pxor xmm6, xmm14
+ pxor xmm7, xmm15
+ mov eax, r13d
+ jne 9b
+ movdqa xmm9, xmm0
+ punpckldq xmm0, xmm1
+ punpckhdq xmm9, xmm1
+ movdqa xmm11, xmm2
+ punpckldq xmm2, xmm3
+ punpckhdq xmm11, xmm3
+ movdqa xmm1, xmm0
+ punpcklqdq xmm0, xmm2
+ punpckhqdq xmm1, xmm2
+ movdqa xmm3, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm3, xmm11
+ movdqu xmmword ptr [rbx], xmm0
+ movdqu xmmword ptr [rbx+0x20], xmm1
+ movdqu xmmword ptr [rbx+0x40], xmm9
+ movdqu xmmword ptr [rbx+0x60], xmm3
+ movdqa xmm9, xmm4
+ punpckldq xmm4, xmm5
+ punpckhdq xmm9, xmm5
+ movdqa xmm11, xmm6
+ punpckldq xmm6, xmm7
+ punpckhdq xmm11, xmm7
+ movdqa xmm5, xmm4
+ punpcklqdq xmm4, xmm6
+ punpckhqdq xmm5, xmm6
+ movdqa xmm7, xmm9
+ punpcklqdq xmm9, xmm11
+ punpckhqdq xmm7, xmm11
+ movdqu xmmword ptr [rbx+0x10], xmm4
+ movdqu xmmword ptr [rbx+0x30], xmm5
+ movdqu xmmword ptr [rbx+0x50], xmm9
+ movdqu xmmword ptr [rbx+0x70], xmm7
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm0, xmm1
+ paddd xmm1, xmmword ptr [rsp+0x150]
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+ pcmpgtd xmm0, xmm1
+ movdqa xmm1, xmmword ptr [rsp+0x120]
+ psubd xmm1, xmm0
+ movdqa xmmword ptr [rsp+0x120], xmm1
+ add rbx, 128
+ add rdi, 32
+ sub rsi, 4
+ cmp rsi, 4
+ jnc 2b
+ test rsi, rsi
+ jnz 3f
+4:
+ mov rsp, rbp
+ pop rbp
+ pop rbx
+ pop r12
+ pop r13
+ pop r14
+ pop r15
+ ret
+.p2align 5
+3:
+ test esi, 0x2
+ je 3f
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm8, xmm0
+ movaps xmm9, xmm1
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp], xmm13
+ movd xmm14, dword ptr [rsp+0x114]
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmmword ptr [rsp+0x10], xmm14
+ mov r8, qword ptr [rdi]
+ mov r9, qword ptr [rdi+0x8]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm10, xmm2
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm3, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm3, xmm5, 221
+ movaps xmm5, xmm3
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm3, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm3, xmm7, 221
+ pshufd xmm7, xmm3, 0x93
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
+ movaps xmm11, xmm12
+ shufps xmm12, xmm13, 136
+ shufps xmm11, xmm13, 221
+ movaps xmm13, xmm11
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
+ movaps xmm11, xmm14
+ shufps xmm14, xmm15, 136
+ pshufd xmm14, xmm14, 0x93
+ shufps xmm11, xmm15, 221
+ pshufd xmm15, xmm11, 0x93
+ movaps xmm3, xmmword ptr [rsp]
+ movaps xmm11, xmmword ptr [rsp+0x10]
+ pinsrd xmm3, eax, 3
+ pinsrd xmm11, eax, 3
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm8, xmm12
+ movaps xmmword ptr [rsp+0x20], xmm4
+ movaps xmmword ptr [rsp+0x30], xmm12
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm12, xmmword ptr [ROT16+rip]
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm5
+ paddd xmm8, xmm13
+ movaps xmmword ptr [rsp+0x40], xmm5
+ movaps xmmword ptr [rsp+0x50], xmm13
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ movaps xmm13, xmmword ptr [ROT8+rip]
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm8, xmm8, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ pshufd xmm10, xmm10, 0x39
+ paddd xmm0, xmm6
+ paddd xmm8, xmm14
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm12
+ pshufb xmm11, xmm12
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 20
+ psrld xmm4, 12
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 20
+ psrld xmm4, 12
+ por xmm9, xmm4
+ paddd xmm0, xmm7
+ paddd xmm8, xmm15
+ paddd xmm0, xmm1
+ paddd xmm8, xmm9
+ pxor xmm3, xmm0
+ pxor xmm11, xmm8
+ pshufb xmm3, xmm13
+ pshufb xmm11, xmm13
+ paddd xmm2, xmm3
+ paddd xmm10, xmm11
+ pxor xmm1, xmm2
+ pxor xmm9, xmm10
+ movdqa xmm4, xmm1
+ pslld xmm1, 25
+ psrld xmm4, 7
+ por xmm1, xmm4
+ movdqa xmm4, xmm9
+ pslld xmm9, 25
+ psrld xmm4, 7
+ por xmm9, xmm4
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm8, xmm8, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm11, xmm11, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ pshufd xmm10, xmm10, 0x93
+ dec al
+ je 9f
+ movdqa xmm12, xmmword ptr [rsp+0x20]
+ movdqa xmm5, xmmword ptr [rsp+0x40]
+ pshufd xmm13, xmm12, 0x0F
+ shufps xmm12, xmm5, 214
+ pshufd xmm4, xmm12, 0x39
+ movdqa xmm12, xmm6
+ shufps xmm12, xmm7, 250
+ pblendw xmm13, xmm12, 0xCC
+ movdqa xmm12, xmm7
+ punpcklqdq xmm12, xmm5
+ pblendw xmm12, xmm6, 0xC0
+ pshufd xmm12, xmm12, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmmword ptr [rsp+0x20], xmm13
+ movdqa xmmword ptr [rsp+0x40], xmm12
+ movdqa xmm5, xmmword ptr [rsp+0x30]
+ movdqa xmm13, xmmword ptr [rsp+0x50]
+ pshufd xmm6, xmm5, 0x0F
+ shufps xmm5, xmm13, 214
+ pshufd xmm12, xmm5, 0x39
+ movdqa xmm5, xmm14
+ shufps xmm5, xmm15, 250
+ pblendw xmm6, xmm5, 0xCC
+ movdqa xmm5, xmm15
+ punpcklqdq xmm5, xmm13
+ pblendw xmm5, xmm14, 0xC0
+ pshufd xmm5, xmm5, 0x78
+ punpckhdq xmm13, xmm15
+ punpckldq xmm14, xmm13
+ pshufd xmm15, xmm14, 0x1E
+ movdqa xmm13, xmm6
+ movdqa xmm14, xmm5
+ movdqa xmm5, xmmword ptr [rsp+0x20]
+ movdqa xmm6, xmmword ptr [rsp+0x40]
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm8, xmm10
+ pxor xmm9, xmm11
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ movups xmmword ptr [rbx+0x20], xmm8
+ movups xmmword ptr [rbx+0x30], xmm9
+ movdqa xmm0, xmmword ptr [rsp+0x130]
+ movdqa xmm1, xmmword ptr [rsp+0x110]
+ movdqa xmm2, xmmword ptr [rsp+0x120]
+ movdqu xmm3, xmmword ptr [rsp+0x118]
+ movdqu xmm4, xmmword ptr [rsp+0x128]
+ blendvps xmm1, xmm3, xmm0
+ blendvps xmm2, xmm4, xmm0
+ movdqa xmmword ptr [rsp+0x110], xmm1
+ movdqa xmmword ptr [rsp+0x120], xmm2
+ add rdi, 16
+ add rbx, 64
+ sub rsi, 2
+3:
+ test esi, 0x1
+ je 4b
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movd xmm13, dword ptr [rsp+0x110]
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ mov r8, qword ptr [rdi]
+ movzx eax, byte ptr [rbp+0x40]
+ or eax, r13d
+ xor edx, edx
+2:
+ mov r14d, eax
+ or eax, r12d
+ add rdx, 64
+ cmp rdx, r15
+ cmovne eax, r14d
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ movaps xmm3, xmm13
+ pinsrd xmm3, eax, 3
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ mov al, 7
+9:
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ mov eax, r13d
+ cmp rdx, r15
+ jne 2b
+ movups xmmword ptr [rbx], xmm0
+ movups xmmword ptr [rbx+0x10], xmm1
+ jmp 4b
+/*
+ * zfs_blake3_compress_in_place_sse41
+ *
+ * Runs the 7-pass mixing loop below over one 64-byte message block and
+ * overwrites the 32-byte value at [rdi] with the result.
+ *
+ * Register arguments as consumed by the code (System V AMD64 order):
+ *   rdi - 32-byte chaining value, read on entry and written on exit
+ *   rsi - 64-byte message block, read only
+ *   dl  - 8-bit value placed in state word 14
+ *         (block length in BLAKE3 terms - confirm against the C prototype)
+ *   rcx - 64-bit value placed in state words 12 and 13
+ *         (counter in BLAKE3 terms - confirm against the C prototype)
+ *   r8b - 8-bit value placed in state word 15
+ *         (flags in BLAKE3 terms - confirm against the C prototype)
+ *
+ * Only the low byte of rdx and r8 is used. Both are zero-extended here,
+ * exactly as zfs_blake3_compress_xof_sse41 does, so the result does not
+ * depend on the caller having cleared bits 8-63 of those registers.
+ *
+ * Clobbers: rax, rdx, flags, xmm0-xmm9, xmm11, xmm14, xmm15.
+ */
+.p2align 6
+zfs_blake3_compress_in_place_sse41:
+ _CET_ENDBR  /* macro defined outside this chunk; presumably an indirect-branch landing pad - confirm */
+ movups xmm0, xmmword ptr [rdi]  /* row 0 = chaining value words 0-3 (unaligned load) */
+ movups xmm1, xmmword ptr [rdi+0x10]  /* row 1 = chaining value words 4-7 */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* row 2 = the four BLAKE3_IV constants */
+ movzx eax, r8b  /* keep only the low byte of r8 */
+ movzx edx, dl  /* keep only the low byte of rdx */
+ shl rax, 32
+ add rdx, rax  /* rdx = (r8b << 32) | dl */
+ movq xmm3, rcx  /* low qword = 64-bit value from rcx */
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4  /* row 3 = { rcx low, rcx high, dl, r8b } */
+ movups xmm4, xmmword ptr [rsi]  /* message words 0-3 */
+ movups xmm5, xmmword ptr [rsi+0x10]  /* message words 4-7 */
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136  /* xmm4 = even words 0, 2, 4, 6 */
+ shufps xmm8, xmm5, 221  /* odd words 1, 3, 5, 7 */
+ movaps xmm5, xmm8  /* xmm5 = words 1, 3, 5, 7 */
+ movups xmm6, xmmword ptr [rsi+0x20]  /* message words 8-11 */
+ movups xmm7, xmmword ptr [rsi+0x30]  /* message words 12-15 */
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136  /* even words 8, 10, 12, 14 */
+ pshufd xmm6, xmm6, 0x93  /* xmm6 = words 14, 8, 10, 12 */
+ shufps xmm8, xmm7, 221  /* odd words 9, 11, 13, 15 */
+ pshufd xmm7, xmm8, 0x93  /* xmm7 = words 15, 9, 11, 13 */
+ movaps xmm14, xmmword ptr [ROT8+rip]  /* pshufb mask: rotate each dword right by 8 */
+ movaps xmm15, xmmword ptr [ROT16+rip]  /* pshufb mask: rotate each dword by 16 */
+ mov al, 7  /* loop counter: 7 passes through label 9 */
+9:
+ paddd xmm0, xmm4  /* first half: lanes are the columns of the 4x4 state */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15  /* row 3 rotated by 16 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1  /* xmm11 is scratch for the shift-based rotates */
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11  /* row 1 rotated right by 12 */
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14  /* row 3 rotated right by 8 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11  /* row 1 rotated right by 7 */
+ pshufd xmm0, xmm0, 0x93  /* rotate the lanes of rows 0, 3 and 2 (row 1 stays) ... */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39  /* ... so the same lane-wise code now mixes the diagonals */
+ paddd xmm0, xmm6  /* second half: same add / xor / rotate sequence */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39  /* undo the lane rotation of rows 0, 3 and 2 */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f  /* after the 7th pass skip the message reshuffle */
+ movdqa xmm8, xmm4  /* reorder the message words in xmm4-xmm7 for the next pass */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ pxor xmm0, xmm2  /* output words 0-3 = row 0 xor row 2 */
+ pxor xmm1, xmm3  /* output words 4-7 = row 1 xor row 3 */
+ movups xmmword ptr [rdi], xmm0  /* store the result over the input chaining value */
+ movups xmmword ptr [rdi+0x10], xmm1
+ ret
+.p2align 6  /* start the function on a 64-byte boundary */
+zfs_blake3_compress_xof_sse41:  /* compresses the 64-byte block at [rsi] with the 32-byte value at [rdi]; writes 64 bytes to [r9] and never stores to [rdi] */
+ _CET_ENDBR  /* macro defined outside this chunk; presumably an indirect-branch landing pad - confirm */
+ movups xmm0, xmmword ptr [rdi]  /* row 0 = chaining value words 0-3 (unaligned load) */
+ movups xmm1, xmmword ptr [rdi+0x10]  /* row 1 = chaining value words 4-7 */
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* row 2 = the four BLAKE3_IV constants */
+ movzx eax, r8b  /* keep only the low byte of r8 (flags in BLAKE3 terms - confirm against the C prototype) */
+ movzx edx, dl  /* keep only the low byte of rdx (block length in BLAKE3 terms - confirm) */
+ shl rax, 32
+ add rdx, rax  /* rdx = (r8b << 32) | dl */
+ movq xmm3, rcx  /* low qword = 64-bit value from rcx (counter in BLAKE3 terms - confirm) */
+ movq xmm4, rdx
+ punpcklqdq xmm3, xmm4  /* row 3 = { rcx low, rcx high, dl, r8b } */
+ movups xmm4, xmmword ptr [rsi]  /* message words 0-3 */
+ movups xmm5, xmmword ptr [rsi+0x10]  /* message words 4-7 */
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136  /* xmm4 = even words 0, 2, 4, 6 */
+ shufps xmm8, xmm5, 221  /* odd words 1, 3, 5, 7 */
+ movaps xmm5, xmm8  /* xmm5 = words 1, 3, 5, 7 */
+ movups xmm6, xmmword ptr [rsi+0x20]  /* message words 8-11 */
+ movups xmm7, xmmword ptr [rsi+0x30]  /* message words 12-15 */
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136  /* even words 8, 10, 12, 14 */
+ pshufd xmm6, xmm6, 0x93  /* xmm6 = words 14, 8, 10, 12 */
+ shufps xmm8, xmm7, 221  /* odd words 9, 11, 13, 15 */
+ pshufd xmm7, xmm8, 0x93  /* xmm7 = words 15, 9, 11, 13 */
+ movaps xmm14, xmmword ptr [ROT8+rip]  /* pshufb mask: rotate each dword right by 8 */
+ movaps xmm15, xmmword ptr [ROT16+rip]  /* pshufb mask: rotate each dword by 16 */
+ mov al, 7  /* loop counter: 7 passes through label 9 */
+9:
+ paddd xmm0, xmm4  /* first half: lanes are the columns of the 4x4 state */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15  /* row 3 rotated by 16 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1  /* xmm11 is scratch for the shift-based rotates */
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11  /* row 1 rotated right by 12 */
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14  /* row 3 rotated right by 8 */
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11  /* row 1 rotated right by 7 */
+ pshufd xmm0, xmm0, 0x93  /* rotate the lanes of rows 0, 3 and 2 (row 1 stays) ... */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39  /* ... so the same lane-wise code now mixes the diagonals */
+ paddd xmm0, xmm6  /* second half: same add / xor / rotate sequence */
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ pshufd xmm0, xmm0, 0x39  /* undo the lane rotation of rows 0, 3 and 2 */
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f  /* after the 7th pass skip the message reshuffle */
+ movdqa xmm8, xmm4  /* reorder the message words in xmm4-xmm7 for the next pass */
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+9:
+ movdqu xmm4, xmmword ptr [rdi]  /* reload the unmodified input chaining value, words 0-3 */
+ movdqu xmm5, xmmword ptr [rdi+0x10]  /* input chaining value, words 4-7 */
+ pxor xmm0, xmm2  /* output words 0-3 = row 0 xor row 2 */
+ pxor xmm1, xmm3  /* output words 4-7 = row 1 xor row 3 */
+ pxor xmm2, xmm4  /* output words 8-11 = row 2 xor input words 0-3 */
+ pxor xmm3, xmm5  /* output words 12-15 = row 3 xor input words 4-7 */
+ movups xmmword ptr [r9], xmm0  /* 64-byte result goes to the buffer at r9 (unaligned stores) */
+ movups xmmword ptr [r9+0x10], xmm1
+ movups xmmword ptr [r9+0x20], xmm2
+ movups xmmword ptr [r9+0x30], xmm3
+ ret
+
+.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41
+.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41
+.size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+.p2align 6  /* table starts 64-byte aligned; every entry below is 16 bytes, so each label stays 16-byte aligned for movaps/movdqa loads */
+BLAKE3_IV:  /* four 32-bit constants, loaded as one vector to form state row 2 */
+ .long 0x6A09E667, 0xBB67AE85
+ .long 0x3C6EF372, 0xA54FF53A
+ROT16:  /* pshufb mask that swaps the 16-bit halves of each dword, i.e. rotates it by 16 bits */
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ROT8:  /* pshufb mask that rotates each dword right by 8 bits */
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+ADD0:  /* lane indices 0-3; not referenced in the visible code, presumably per-lane counter offsets in hash_many - confirm */
+ .long 0, 1, 2, 3
+ADD1:  /* the value 4 in every lane; not referenced in the visible code, presumably the per-iteration counter step - confirm */
+ .long 4, 4, 4, 4
+BLAKE3_IV_0:  /* first BLAKE3_IV word replicated into all four lanes */
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
+BLAKE3_IV_1:  /* second BLAKE3_IV word replicated into all four lanes */
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
+BLAKE3_IV_2:  /* third BLAKE3_IV word replicated into all four lanes */
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
+BLAKE3_IV_3:  /* fourth BLAKE3_IV word replicated into all four lanes */
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
+BLAKE3_BLOCK_LEN:  /* the value 64 in every lane; hash_many inserts one dword of it into state row 3 with pinsrd */
+ .long 64, 64, 64, 64
+CMP_MSB_MASK:  /* sign-bit mask: hash_many xors both pcmpgtd operands with it so the signed compare orders them as unsigned values */
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
+
+#endif /* HAVE_SSE4_1 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif