diff options
Diffstat (limited to 'module/icp')
-rw-r--r-- | module/icp/algs/blake3/blake3.c | 732 | ||||
-rw-r--r-- | module/icp/algs/blake3/blake3_generic.c | 202 | ||||
-rw-r--r-- | module/icp/algs/blake3/blake3_impl.c | 256 | ||||
-rw-r--r-- | module/icp/algs/blake3/blake3_impl.h | 213 | ||||
-rw-r--r-- | module/icp/algs/blake3/blake3_x86-64.c | 248 | ||||
-rw-r--r-- | module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 2450 | ||||
-rw-r--r-- | module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 2463 | ||||
-rw-r--r-- | module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S | 2823 | ||||
-rw-r--r-- | module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S | 3064 | ||||
-rw-r--r-- | module/icp/asm-x86_64/blake3/blake3_avx2.S | 1845 | ||||
-rw-r--r-- | module/icp/asm-x86_64/blake3/blake3_avx512.S | 2618 | ||||
-rw-r--r-- | module/icp/asm-x86_64/blake3/blake3_sse2.S | 2323 | ||||
-rw-r--r-- | module/icp/asm-x86_64/blake3/blake3_sse41.S | 2058 |
13 files changed, 21295 insertions, 0 deletions
diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c new file mode 100644 index 000000000..8c9c06eb9 --- /dev/null +++ b/module/icp/algs/blake3/blake3.c @@ -0,0 +1,732 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include <sys/zfs_context.h> +#include <sys/blake3.h> + +#include "blake3_impl.h" + +/* + * We need 1056 byte stack for blake3_compress_subtree_wide() + * - we define this pragma to make gcc happy + */ +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +/* internal used */ +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +/* internal flags */ +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +/* internal start */ +static void chunk_state_init(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint8_t flags) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; + ctx->blocks_compressed = 0; + ctx->flags = flags; +} + +static void chunk_state_reset(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint64_t chunk_counter) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = chunk_counter; + ctx->blocks_compressed = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; +} + +static size_t chunk_state_len(const blake3_chunk_state_t *ctx) +{ + return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) + + ((size_t)ctx->buf_len); +} + +static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx, + const uint8_t *input, size_t input_len) +{ + size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len); + memcpy(dest, input, take); + ctx->buf_len += (uint8_t)take; + return (take); +} + +static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx) +{ + if (ctx->blocks_compressed == 0) { + return (CHUNK_START); + } else { + return (0); + } +} + +static output_t make_output(const uint32_t input_cv[8], + const uint8_t *block, uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return (ret); +} + +/* + * Chaining values within a given chunk (specifically the compress_in_place + * interface) are represented as words. This avoids unnecessary bytes<->words + * conversion overhead in the portable implementation. However, the hash_many + * interface handles both user input and parent node blocks, so it accepts + * bytes. For that reason, chaining values in the CV stack are represented as + * bytes. + */ +static void output_chaining_value(const blake3_impl_ops_t *ops, + const output_t *ctx, uint8_t cv[32]) +{ + uint32_t cv_words[8]; + memcpy(cv_words, ctx->input_cv, 32); + ops->compress_in_place(cv_words, ctx->block, ctx->block_len, + ctx->counter, ctx->flags); + store_cv_words(cv, cv_words); +} + +static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, + uint64_t seek, uint8_t *out, size_t out_len) +{ + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len, + output_block_counter, ctx->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +static void chunk_state_update(const blake3_impl_ops_t *ops, + blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) +{ + if (ctx->buf_len > 0) { + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + ops->compress_in_place(ctx->cv, ctx->buf, + BLAKE3_BLOCK_LEN, ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + ctx->buf_len = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN, + ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; +} + +static output_t chunk_state_output(const blake3_chunk_state_t *ctx) +{ + uint8_t block_flags = + ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END; + return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter, + block_flags)); +} + +static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) +{ + return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT)); +} + +/* + * Given some input larger than one chunk, return the number of bytes that + * should go in the left subtree. This is the largest power-of-2 number of + * chunks that leaves at least 1 byte for the right subtree. + */ +static size_t left_len(size_t content_len) +{ + /* + * Subtract 1 to reserve at least one byte for the right side. + * content_len + * should always be greater than BLAKE3_CHUNK_LEN. + */ + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return (round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN); +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time + * on a single thread. Write out the chunk chaining values and return the + * number of chunks hashed. These chunks are never the root and never empty; + * those cases use a different codepath. + */ +static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / + BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE, flags, CHUNK_START, + CHUNK_END, out); + + /* + * Hash the remaining partial chunk, if there is one. Note that the + * empty chunk (meaning the empty message) is a different codepath. + */ + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(ops, &chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(ops, &output, &out[chunks_array_len * + BLAKE3_OUT_LEN]); + return (chunks_array_len + 1); + } else { + return (chunks_array_len); + } +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time + * on a single thread. Write out the parent chaining values and return the + * number of parents hashed. (If there's an odd input chaining value left over, + * return it as an additional output.) These parents are never the root and + * never empty; those cases use a different codepath. + */ +static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, + const uint8_t *child_chaining_values, size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, uint8_t *out) +{ + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = &child_chaining_values[2 * + parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE, + flags | PARENT, 0, 0, out); + + /* If there's an odd child left over, it becomes an output. */ + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * + BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); + return (parents_array_len + 1); + } else { + return (parents_array_len); + } +} + +/* + * The wide helper function returns (writes out) an array of chaining values + * and returns the length of that array. The number of chaining values returned + * is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, + * if the input is shorter than that many chunks. The reason for maintaining a + * wide array of chaining values going back up the tree, is to allow the + * implementation to hash as many parents in parallel as possible. + * + * As a special case when the SIMD degree is 1, this function will still return + * at least 2 outputs. This guarantees that this function doesn't perform the + * root compression. (If it did, it would use the wrong flags, and also we + * wouldn't be able to implement exendable ouput.) Note that this function is + * not used when the whole input is only 1 chunk long; that's a different + * codepath. + * + * Why not just have the caller split the input on the first update(), instead + * of implementing this special rule? Because we don't want to limit SIMD or + * multi-threading parallelism for that update(). + */ +static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + /* + * Note that the single chunk case does *not* bump the SIMD degree up + * to 2 when it is 1. If this implementation adds multi-threading in + * the future, this gives us the option of multi-threading even the + * 2-chunk case, which can help performance on smaller platforms. + */ + if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) { + return (compress_chunks_parallel(ops, input, input_len, key, + chunk_counter, flags, out)); + } + + + /* + * With more than simd_degree chunks, we need to recurse. Start by + * dividing the input into left and right subtrees. (Note that this is + * only optimal as long as the SIMD degree is a power of 2. If we ever + * get a SIMD degree of 3 or something, we'll need a more complicated + * strategy.) + */ + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = chunk_counter + + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + /* + * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 + * to account for the special case of returning 2 outputs when the + * SIMD degree is 1. + */ + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = ops->degree; + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + + /* + * The special case: We always use a degree of at least two, + * to make sure there are two outputs. Except, as noted above, + * at the chunk level, where we allow degree=1. (Note that the + * 1-chunk-input case is a different codepath.) + */ + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + /* + * Recurse! If this implementation adds multi-threading support in the + * future, this is where it will go. + */ + size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len, + key, chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide(ops, right_input, + right_input_len, key, right_chunk_counter, flags, right_cvs); + + /* + * The special case again. If simd_degree=1, then we'll have left_n=1 + * and right_n=1. Rather than compressing them into a single output, + * return them directly, to make sure we always have at least two + * outputs. + */ + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return (2); + } + + /* Otherwise, do one layer of parent node compression. */ + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(ops, cv_array, + num_chaining_values, key, flags, out); +} + +/* + * Hash a subtree with compress_subtree_wide(), and then condense the resulting + * list of chaining values down to a single parent node. Don't compress that + * last parent node, however. Instead, return its message bytes (the + * concatenated chaining values of its children). This is necessary when the + * first call to update() supplies a complete subtree, because the topmost + * parent node of that subtree could end up being the root. It's also necessary + * for extended output in the general case. + * + * As with compress_subtree_wide(), this function is not used on inputs of 1 + * chunk or less. That's a different codepath. + */ +static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) +{ + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(ops, input, input_len, + key, chunk_counter, flags, cv_array); + + /* + * If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + * compress_subtree_wide() returns more than 2 chaining values. Condense + * them into 2 by forming parent nodes repeatedly. + */ + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = compress_parents_parallel(ops, cv_array, num_cvs, key, + flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], + uint8_t flags) +{ + memcpy(ctx->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&ctx->chunk, key, flags); + ctx->cv_stack_len = 0; + ctx->ops = blake3_impl_get_ops(); +} + +/* + * As described in hasher_push_cv() below, we do "lazy merging", delaying + * merges until right before the next CV is about to be added. This is + * different from the reference implementation. Another difference is that we + * aren't always merging 1 chunk at a time. Instead, each CV might represent + * any power-of-two number of chunks, as long as the smaller-above-larger + * stack order is maintained. Instead of the "count the trailing 0-bits" + * algorithm described in the spec, we use a "count the total number of + * 1-bits" variant that doesn't require us to retain the subtree size of the + * CV on top of the stack. The principle is the same: each CV that should + * remain in the stack is represented by a 1-bit in the total number of chunks + * (or bytes) so far. + */ +static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len) +{ + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (ctx->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = + parent_output(parent_node, ctx->key, ctx->chunk.flags); + output_chaining_value(ctx->ops, &output, parent_node); + ctx->cv_stack_len -= 1; + } +} + +/* + * In reference_impl.rs, we merge the new CV with existing CVs from the stack + * before pushing it. We can do that because we know more input is coming, so + * we know none of the merges are root. + * + * This setting is different. We want to feed as much input as possible to + * compress_subtree_wide(), without setting aside anything for the chunk_state. + * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once + * as a single subtree, if at all possible. + * + * This leads to two problems: + * 1) This 64 KiB input might be the only call that ever gets made to update. + * In this case, the root node of the 64 KiB subtree would be the root node + * of the whole tree, and it would need to be ROOT finalized. We can't + * compress it until we know. + * 2) This 64 KiB input might complete a larger tree, whose root node is + * similarly going to be the the root of the whole tree. For example, maybe + * we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the + * node at the root of the 256 KiB subtree until we know how to finalize it. + * + * The second problem is solved with "lazy merging". That is, when we're about + * to add a CV to the stack, we don't merge it with anything first, as the + * reference impl does. Instead we do merges using the *previous* CV that was + * added, which is sitting on top of the stack, and we put the new CV + * (unmerged) on top of the stack afterwards. This guarantees that we never + * merge the root node until finalize(). + * + * Solving the first problem requires an additional tool, + * compress_subtree_to_parent_node(). That function always returns the top + * *two* chaining values of the subtree it's compressing. We then do lazy + * merging with each of them separately, so that the second CV will always + * remain unmerged. (That also helps us support extendable output when we're + * hashing an input all-at-once.) + */ +static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) +{ + hasher_merge_cv_stack(ctx, chunk_counter); + memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + ctx->cv_stack_len += 1; +} + +void +Blake3_Init(BLAKE3_CTX *ctx) +{ + hasher_init_base(ctx, BLAKE3_IV, 0); +} + +void +Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]) +{ + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(ctx, key_words, KEYED_HASH); +} + +static void +Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector<uint8_t> v; + * blake3_hasher_update(&hasher, v.data(), v.size()); + */ + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + /* + * If we have some partial chunk bytes in the internal chunk_state, we + * need to finish that chunk first. + */ + if (chunk_state_len(&ctx->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + /* + * If we've filled the current chunk and there's more coming, + * finalize this chunk and proceed. In this case we know it's + * not the root. + */ + if (input_len > 0) { + output_t output = chunk_state_output(&ctx->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(ctx->ops, &output, chunk_cv); + hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter); + chunk_state_reset(&ctx->chunk, ctx->key, + ctx->chunk.chunk_counter + 1); + } else { + return; + } + } + + /* + * Now the chunk_state is clear, and we have more input. If there's + * more than a single chunk (so, definitely not the root chunk), hash + * the largest whole subtree we can, with the full benefits of SIMD + * (and maybe in the future, multi-threading) parallelism. Two + * restrictions: + * - The subtree has to be a power-of-2 number of chunks. Only + * subtrees along the right edge can be incomplete, and we don't know + * where the right edge is going to be until we get to finalize(). + * - The subtree must evenly divide the total number of chunks up + * until this point (if total is not 0). If the current incomplete + * subtree is only waiting for 1 more chunk, we can't hash a subtree + * of 4 chunks. We have to complete the current subtree first. + * Because we might need to break up the input to form powers of 2, or + * to evenly divide what we already have, this part runs in a loop. + */ + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = + ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + /* + * Shrink the subtree_len until it evenly divides the count so + * far. We know that subtree_len itself is a power of 2, so we + * can use a bitmasking trick instead of an actual remainder + * operation. (Note that if the caller consistently passes + * power-of-2 inputs of the same size, as is hopefully + * typical, this loop condition will always fail, and + * subtree_len will always be the full length of the input.) + * + * An aside: We don't have to shrink subtree_len quite this + * much. For example, if count_so_far is 1, we could pass 2 + * chunks to compress_subtree_to_parent_node. Since we'll get + * 2 CVs back, we'll still get the right answer in the end, + * and we might get to use 2-way SIMD parallelism. The problem + * with this optimization, is that it gets us stuck always + * hashing 2 chunks. The total number of chunks will remain + * odd, and we'll never graduate to higher degrees of + * parallelism. See + * https://github.com/BLAKE3-team/BLAKE3/issues/69. + */ + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + /* + * The shrunken subtree_len might now be 1 chunk long. If so, + * hash that one chunk by itself. Otherwise, compress the + * subtree into a pair of CVs. + */ + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, ctx->key, + ctx->chunk.flags); + chunk_state.chunk_counter = ctx->chunk.chunk_counter; + chunk_state_update(ctx->ops, &chunk_state, input_bytes, + subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(ctx->ops, &output, cv); + hasher_push_cv(ctx, cv, chunk_state.chunk_counter); + } else { + /* + * This is the high-performance happy path, though + * getting here depends on the caller giving us a long + * enough input. + */ + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(ctx->ops, input_bytes, + subtree_len, ctx->key, ctx-> chunk.chunk_counter, + ctx->chunk.flags, cv_pair); + hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter); + hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN], + ctx->chunk.chunk_counter + (subtree_chunks / 2)); + } + ctx->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + /* + * If there's any remaining input less than a full chunk, add it to + * the chunk state. In that case, also do a final merge loop to make + * sure the subtree stack doesn't contain any unmerged pairs. The + * remaining input means we know these merges are non-root. This merge + * loop isn't strictly necessary here, because hasher_push_chunk_cv + * already does its own merge loop, but it simplifies + * blake3_hasher_finalize below. + */ + if (input_len > 0) { + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, + input_len); + hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter); + } +} + +void +Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo) +{ + size_t done = 0; + const uint8_t *data = input; + const size_t block_max = 1024 * 64; + + /* max feed buffer to leave the stack size small */ + while (todo != 0) { + size_t block = (todo >= block_max) ? block_max : todo; + Blake3_Update2(ctx, data + done, block); + done += block; + todo -= block; + } +} + +void +Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out) +{ + Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN); +} + +void +Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out, + size_t out_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector<uint8_t> v; + * blake3_hasher_finalize(&hasher, v.data(), v.size()); + */ + if (out_len == 0) { + return; + } + /* If the subtree stack is empty, then the current chunk is the root. */ + if (ctx->cv_stack_len == 0) { + output_t output = chunk_state_output(&ctx->chunk); + output_root_bytes(ctx->ops, &output, seek, out, out_len); + return; + } + /* + * If there are any bytes in the chunk state, finalize that chunk and + * do a roll-up merge between that chunk hash and every subtree in the + * stack. In this case, the extra merge loop at the end of + * blake3_hasher_update guarantees that none of the subtrees in the + * stack need to be merged with each other first. Otherwise, if there + * are no bytes in the chunk state, then the top of the stack is a + * chunk hash, and we start the merge from that. + */ + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&ctx->chunk) > 0) { + cvs_remaining = ctx->cv_stack_len; + output = chunk_state_output(&ctx->chunk); + } else { + /* There are always at least 2 CVs in the stack in this case. */ + cvs_remaining = ctx->cv_stack_len - 2; + output = parent_output(&ctx->cv_stack[cvs_remaining * 32], + ctx->key, ctx->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(ctx->ops, &output, &parent_block[32]); + output = parent_output(parent_block, ctx->key, + ctx->chunk.flags); + } + output_root_bytes(ctx->ops, &output, seek, out, out_len); +} diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c new file mode 100644 index 000000000..6ff9a845c --- /dev/null +++ b/module/icp/algs/blake3/blake3_generic.c @@ -0,0 +1,202 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include <sys/zfs_context.h> +#include "blake3_impl.h" + +#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +static inline void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) +{ + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +static inline void round_fn(uint32_t state[16], const uint32_t *msg, + size_t round) +{ + /* Select the message schedule based on the round. */ + const uint8_t *schedule = BLAKE3_MSG_SCHEDULE[round]; + + /* Mix the columns. */ + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + /* Mix the rows. */ + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +static inline void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = BLAKE3_IV[0]; + state[9] = BLAKE3_IV[1]; + state[10] = BLAKE3_IV[2]; + state[11] = BLAKE3_IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +static inline void blake3_compress_in_place_generic(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +static inline void hash_one_generic(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) +{ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_generic(cv, input, BLAKE3_BLOCK_LEN, + counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +static inline void blake3_compress_xof_generic(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +static inline void blake3_hash_many_generic(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, + boolean_t increment_counter, uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) +{ + while (num_inputs > 0) { + hash_one_generic(inputs[0], blocks, key, counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} + +static inline boolean_t blake3_is_generic_supported(void) +{ + return (B_TRUE); +} + +const blake3_impl_ops_t blake3_generic_impl = { + .compress_in_place = blake3_compress_in_place_generic, + .compress_xof = blake3_compress_xof_generic, + .hash_many = blake3_hash_many_generic, + .is_supported = blake3_is_generic_supported, + .degree = 4, + .name = "generic" +}; diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c new file mode 100644 index 000000000..c3268ec13 --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.c @@ -0,0 +1,256 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include <sys/zfs_context.h> +#include <sys/zio_checksum.h> + +#include "blake3_impl.h" + +static const blake3_impl_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +}; + +/* this pointer holds current ops for implementation */ +static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl; + +/* special implementation selections */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX-1) +#define IMPL_USER (UINT32_MAX-2) +#define IMPL_PARAM (UINT32_MAX-3) + +#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) +static uint32_t icp_blake3_impl = IMPL_FASTEST; + +#define BLAKE3_IMPL_NAME_MAX 16 + +/* id of fastest implementation */ +static uint32_t blake3_fastest_id = 0; + +/* currently used id */ +static uint32_t blake3_current_id = 0; + +/* id of module parameter (-1 == unused) */ +static int blake3_param_id = -1; + +/* return number of supported implementations */ +int +blake3_get_impl_count(void) +{ + static int impls = 0; + int i; + + if (impls) + return (impls); + + for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + impls++; + } + + return (impls); +} + +/* return id of selected implementation */ +int +blake3_get_impl_id(void) +{ + return (blake3_current_id); +} + +/* return name of selected implementation */ +const char * +blake3_get_impl_name(void) +{ + return (blake3_selected_impl->name); +} + +/* setup id as fastest implementation */ +void +blake3_set_impl_fastest(uint32_t id) +{ + blake3_fastest_id = id; +} + +/* set implementation by id */ +void +blake3_set_impl_id(uint32_t id) +{ + int i, cid; + + /* select fastest */ + if (id == IMPL_FASTEST) + id = blake3_fastest_id; + + /* select next or first */ + if (id == IMPL_CYCLE) + id = (++blake3_current_id) % blake3_get_impl_count(); + + /* 0..N for the real impl */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (cid == id) { + blake3_current_id = cid; + blake3_selected_impl = blake3_impls[i]; + return; + } + cid++; + } +} + +/* set implementation by name */ +int +blake3_set_impl_name(const char *name) +{ + int i, cid; + + if (strcmp(name, "fastest") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST); + blake3_set_impl_id(IMPL_FASTEST); + return (0); + } else if (strcmp(name, "cycle") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE); + blake3_set_impl_id(IMPL_CYCLE); + return (0); + } + + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (strcmp(name, blake3_impls[i]->name) == 0) { + if (icp_blake3_impl == IMPL_PARAM) { + blake3_param_id = cid; + return (0); + } + blake3_selected_impl = blake3_impls[i]; + blake3_current_id = cid; + return (0); + } + cid++; + } + + return (-EINVAL); +} + +/* setup implementation */ +void +blake3_setup_impl(void) +{ + switch (IMPL_READ(icp_blake3_impl)) { + case IMPL_PARAM: + blake3_set_impl_id(blake3_param_id); + atomic_swap_32(&icp_blake3_impl, IMPL_USER); + break; + case IMPL_FASTEST: + blake3_set_impl_id(IMPL_FASTEST); + break; + case IMPL_CYCLE: + blake3_set_impl_id(IMPL_CYCLE); + break; + default: + blake3_set_impl_id(blake3_current_id); + break; + } +} + +/* return selected implementation */ +const blake3_impl_ops_t * +blake3_impl_get_ops(void) +{ + /* each call to ops will cycle */ + if (icp_blake3_impl == IMPL_CYCLE) + blake3_set_impl_id(IMPL_CYCLE); + + return (blake3_selected_impl); +} + +#if defined(_KERNEL) && defined(__linux__) +static int +icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp) +{ + char req_name[BLAKE3_IMPL_NAME_MAX]; + size_t i; + + /* sanitize input */ + i = strnlen(name, BLAKE3_IMPL_NAME_MAX); + if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX) + return (-EINVAL); + + strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX); + while (i > 0 && isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + atomic_swap_32(&icp_blake3_impl, IMPL_PARAM); + return (blake3_set_impl_name(req_name)); +} + +static int +icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp) +{ + int i, cid, cnt = 0; + char *fmt; + + /* cycling */ + fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle "; + cnt += sprintf(buffer + cnt, fmt); + + /* fastest one */ + fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest "; + cnt += sprintf(buffer + cnt, fmt); + + /* user selected */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + fmt = (icp_blake3_impl == IMPL_USER && + cid == blake3_current_id) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name); + cid++; + } + + buffer[cnt] = 0; + + return (cnt); +} + +module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get, + NULL, 0644); +MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation."); +#endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h new file mode 100644 index 000000000..7b40cc4d3 --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/blake3.h> +#include <sys/simd.h> + +/* + * Methods used to define BLAKE3 assembler implementations + */ +typedef void (*blake3_compress_in_place_f)(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +typedef void (*blake3_compress_xof_f)(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +typedef boolean_t (*blake3_is_supported_f)(void); + +typedef struct blake3_impl_ops { + blake3_compress_in_place_f compress_in_place; + blake3_compress_xof_f compress_xof; + blake3_hash_many_f hash_many; + blake3_is_supported_f is_supported; + int degree; + const char *name; +} blake3_impl_ops_t; + +/* Return selected BLAKE3 implementation ops */ +extern const blake3_impl_ops_t *blake3_impl_get_ops(void); + +extern const blake3_impl_ops_t blake3_generic_impl; + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse2_impl; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse41_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern const blake3_impl_ops_t blake3_avx2_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern const blake3_impl_ops_t blake3_avx512_impl; +#endif + +#if defined(__x86_64) +#define MAX_SIMD_DEGREE 16 +#else +#define MAX_SIMD_DEGREE 4 +#endif + +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) + +static const uint32_t BLAKE3_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +static inline unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (63 ^ __builtin_clzll(x)); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return (index); +#elif defined(_MSC_VER) && defined(IS_X86_32) + if (x >> 32) { + unsigned long index; + _BitScanReverse(&index, x >> 32); + return (32 + index); + } else { + unsigned long index; + _BitScanReverse(&index, x); + return (index); + } +#else + unsigned int c = 0; + if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if (x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if (x & 0x0000000000000002ULL) { c += 1; } + return (c); +#endif +} + +/* Count the number of 1 bits. */ +static inline unsigned int popcnt(uint64_t x) { + unsigned int count = 0; + + while (x != 0) { + count += 1; + x &= x - 1; + } + + return (count); +} + +/* + * Largest power of two less than or equal to x. + * As a special case, returns 1 when x is 0. + */ +static inline uint64_t round_down_to_power_of_2(uint64_t x) { + return (1ULL << highest_one(x | 1)); +} + +static inline uint32_t counter_low(uint64_t counter) { + return ((uint32_t)counter); +} + +static inline uint32_t counter_high(uint64_t counter) { + return ((uint32_t)(counter >> 32)); +} + +static inline uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +static inline void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_IMPL_H */ diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c new file mode 100644 index 000000000..48715e212 --- /dev/null +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt <[email protected]> + */ + +#include "blake3_impl.h" + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse2_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse41_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx2_supported(void) +{ + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); +} + +const blake3_impl_ops_t blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx512_supported(void) +{ + return (kfpu_allowed() && zfs_avx512f_available() && + zfs_avx512vl_available()); +} + +const blake3_impl_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S new file mode 100644 index 000000000..59a4d9afd --- /dev/null +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -0,0 +1,2450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE2 -> ARMv8-A + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if defined(__aarch64__) + .text + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI0_1: + .xword 0 + .xword -4294967296 +.LCPI0_2: + .xword -1 + .xword 4294967295 + .text + .globl zfs_blake3_compress_in_place_sse2 + .p2align 2 + .type zfs_blake3_compress_in_place_sse2,@function +zfs_blake3_compress_in_place_sse2: + .cfi_startproc + ldp q3, q2, [x0] + ldp q5, q6, [x1] + add x10, x1, #32 + lsr x11, x3, #32 + fmov s4, w3 + ld2 { v17.4s, v18.4s }, [x10] + adrp x10, .LCPI0_2 + and w8, w2, #0xff + mov v4.s[1], w11 + ldr q1, [x10, :lo12:.LCPI0_2] + and w9, w4, #0xff + adrp x12, .LCPI0_0 + mov v4.s[2], w8 + uzp1 v19.4s, v5.4s, v6.4s + add v3.4s, v2.4s, v3.4s + ldr q7, [x12, :lo12:.LCPI0_0] + mov v4.s[3], w9 + add v3.4s, v3.4s, v19.4s + uzp2 v5.4s, v5.4s, v6.4s + ext v21.16b, v18.16b, v18.16b, #12 + uzp1 v6.4s, v19.4s, v19.4s + ext v22.16b, v19.16b, v19.16b, #12 + eor v4.16b, v3.16b, v4.16b + ext v20.16b, v17.16b, v17.16b, #12 + ext v6.16b, v6.16b, v19.16b, #8 + ext v19.16b, v19.16b, v22.16b, #12 + zip1 v22.2d, v21.2d, v5.2d + rev32 v24.8h, v4.8h + mov v4.16b, v1.16b + zip2 v23.4s, v5.4s, v21.4s + uzp2 v6.4s, v6.4s, v5.4s + bsl v4.16b, v22.16b, v20.16b + add v3.4s, v3.4s, v5.4s + zip1 v5.4s, v23.4s, v20.4s + zip1 v22.4s, v20.4s, v23.4s + add v23.4s, v24.4s, v7.4s + ext v7.16b, v6.16b, v6.16b, #4 + ext v25.16b, v4.16b, v4.16b, #12 + ext v5.16b, v22.16b, v5.16b, #8 + eor v2.16b, v23.16b, v2.16b + uzp1 v4.4s, v4.4s, v25.4s + uzp1 v22.4s, v7.4s, v7.4s + ext v25.16b, v7.16b, v7.16b, #12 + ext v22.16b, v22.16b, v7.16b, #8 + ext v7.16b, v7.16b, v25.16b, #12 + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + add v3.4s, v3.4s, v2.4s + eor v24.16b, v3.16b, v24.16b + add v3.4s, v3.4s, v17.4s + ushr v17.4s, v24.4s, #8 + shl v18.4s, v24.4s, #24 + orr v17.16b, v18.16b, v17.16b + add v18.4s, v17.4s, v23.4s + eor v2.16b, v18.16b, v2.16b + ushr v23.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v3.16b, v3.16b, v3.16b, #12 + orr v2.16b, v2.16b, v23.16b + ext v17.16b, v17.16b, v17.16b, #8 + add v3.4s, v2.4s, v3.4s + adrp x11, .LCPI0_1 + eor v17.16b, v3.16b, v17.16b + ldr q16, [x11, :lo12:.LCPI0_1] + ext v18.16b, v18.16b, v18.16b, #4 + rev32 v24.8h, v17.8h + movi v0.2d, #0xffffffff00000000 + add v23.4s, v3.4s, v21.4s + mov v21.s[1], v20.s[2] + add v20.4s, v18.4s, v24.4s + bit v19.16b, v21.16b, v0.16b + eor v3.16b, v20.16b, v2.16b + uzp2 v2.4s, v22.4s, v19.4s + zip1 v17.2d, v5.2d, v19.2d + zip2 v18.4s, v19.4s, v5.4s + ushr v21.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + ext v22.16b, v2.16b, v2.16b, #4 + bsl v16.16b, v4.16b, v17.16b + zip1 v17.4s, v18.4s, v4.4s + zip1 v18.4s, v4.4s, v18.4s + orr v21.16b, v3.16b, v21.16b + ext v25.16b, v16.16b, v16.16b, #12 + ext v3.16b, v18.16b, v17.16b, #8 + uzp1 v18.4s, v22.4s, v22.4s + ext v26.16b, v22.16b, v22.16b, #12 + add v23.4s, v23.4s, v21.4s + uzp1 v17.4s, v16.4s, v25.4s + ext v16.16b, v18.16b, v22.16b, #8 + ext v18.16b, v22.16b, v26.16b, #12 + eor v22.16b, v23.16b, v24.16b + add v6.4s, v23.4s, v6.4s + ushr v23.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v23.16b + add v20.4s, v22.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ushr v23.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v21.16b, v21.16b, v23.16b + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v21.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v20.16b, v20.16b, v20.16b, #12 + add v6.4s, v6.4s, v19.4s + rev32 v19.8h, v22.8h + add v20.4s, v20.4s, v19.4s + eor v21.16b, v20.16b, v21.16b + ushr v22.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + orr v21.16b, v21.16b, v22.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ushr v22.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v22.16b + add v20.4s, v19.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #12 + ushr v22.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v21.16b, v21.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ext v20.16b, v20.16b, v20.16b, #4 + rev32 v19.8h, v19.8h + add v20.4s, v20.4s, v19.4s + add v6.4s, v6.4s, v5.4s + mov v5.s[1], v4.s[2] + eor v4.16b, v20.16b, v21.16b + ushr v21.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v21.16b, v4.16b, v21.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + add v2.4s, v6.4s, v2.4s + ushr v6.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v6.16b, v19.16b, v6.16b + add v19.4s, v6.4s, v20.4s + eor v20.16b, v19.16b, v21.16b + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v20.4s, v2.4s + eor v6.16b, v2.16b, v6.16b + ext v19.16b, v19.16b, v19.16b, #12 + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v6.4s + mov v22.16b, v0.16b + eor v20.16b, v19.16b, v20.16b + bsl v22.16b, v5.16b, v7.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + add v2.4s, v2.4s, v22.4s + orr v20.16b, v20.16b, v21.16b + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + ushr v21.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v6.16b, v6.16b, v21.16b + add v19.4s, v6.4s, v19.4s + eor v20.16b, v19.16b, v20.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v2.4s, v2.4s, v17.4s + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + uzp2 v5.4s, v16.4s, v22.4s + zip1 v7.2d, v3.2d, v22.2d + zip2 v16.4s, v22.4s, v3.4s + ext v19.16b, v19.16b, v19.16b, #4 + rev32 v22.8h, v6.8h + ext v23.16b, v5.16b, v5.16b, #4 + bif v7.16b, v17.16b, v1.16b + zip1 v24.4s, v16.4s, v17.4s + zip1 v16.4s, v17.4s, v16.4s + add v21.4s, v2.4s, v3.4s + mov v3.s[1], v17.s[2] + add v17.4s, v19.4s, v22.4s + mov v19.16b, v0.16b + ext v25.16b, v7.16b, v7.16b, #12 + ext v4.16b, v16.16b, v24.16b, #8 + uzp1 v16.4s, v23.4s, v23.4s + bsl v19.16b, v3.16b, v18.16b + eor v2.16b, v17.16b, v20.16b + uzp1 v7.4s, v7.4s, v25.4s + ext v25.16b, v16.16b, v23.16b, #8 + zip1 v3.2d, v4.2d, v19.2d + ushr v20.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ext v24.16b, v23.16b, v23.16b, #12 + uzp2 v6.4s, v25.4s, v19.4s + zip2 v18.4s, v19.4s, v4.4s + bif v3.16b, v7.16b, v1.16b + orr v20.16b, v2.16b, v20.16b + ext v16.16b, v23.16b, v24.16b, #12 + ext v23.16b, v6.16b, v6.16b, #4 + zip1 v24.4s, v18.4s, v7.4s + zip1 v18.4s, v7.4s, v18.4s + ext v25.16b, v3.16b, v3.16b, #12 + add v21.4s, v21.4s, v20.4s + ext v2.16b, v18.16b, v24.16b, #8 + uzp1 v18.4s, v23.4s, v23.4s + ext v24.16b, v23.16b, v23.16b, #12 + uzp1 v3.4s, v3.4s, v25.4s + eor v22.16b, v21.16b, v22.16b + ext v25.16b, v18.16b, v23.16b, #8 + dup v18.4s, v2.s[3] + ext v23.16b, v23.16b, v24.16b, #12 + add v5.4s, v21.4s, v5.4s + trn1 v21.4s, v3.4s, v3.4s + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + ext v18.16b, v21.16b, v18.16b, #8 + orr v21.16b, v22.16b, v24.16b + add v17.4s, v21.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ushr v22.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v20.16b, v22.16b + ext v21.16b, v21.16b, v21.16b, #8 + add v5.4s, v20.4s, v5.4s + eor v21.16b, v5.16b, v21.16b + ext v17.16b, v17.16b, v17.16b, #12 + add v5.4s, v5.4s, v19.4s + rev32 v19.8h, v21.8h + add v17.4s, v17.4s, v19.4s + eor v20.16b, v17.16b, v20.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v21.16b + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ushr v21.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v21.16b + add v17.4s, v19.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v5.4s, v5.4s, v7.4s + orr v20.16b, v20.16b, v21.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ext v17.16b, v17.16b, v17.16b, #4 + rev32 v22.8h, v19.8h + add v21.4s, v5.4s, v4.4s + mov v4.s[1], v7.s[2] + add v19.4s, v17.4s, v22.4s + bit v16.16b, v4.16b, v0.16b + eor v5.16b, v19.16b, v20.16b + uzp2 v4.4s, v25.4s, v16.4s + zip1 v7.2d, v2.2d, v16.2d + zip2 v17.4s, v16.4s, v2.4s + ushr v20.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v24.16b, v4.16b, v4.16b, #4 + bif v7.16b, v3.16b, v1.16b + zip1 v25.4s, v17.4s, v3.4s + zip1 v17.4s, v3.4s, v17.4s + orr v20.16b, v5.16b, v20.16b + ext v26.16b, v7.16b, v7.16b, #12 + ext v5.16b, v17.16b, v25.16b, #8 + uzp1 v17.4s, v24.4s, v24.4s + ext v25.16b, v24.16b, v24.16b, #12 + bit v23.16b, v18.16b, v0.16b + add v21.4s, v21.4s, v20.4s + uzp1 v7.4s, v7.4s, v26.4s + ext v26.16b, v17.16b, v24.16b, #8 + ext v17.16b, v24.16b, v25.16b, #12 + eor v22.16b, v21.16b, v22.16b + add v6.4s, v21.4s, v6.4s + zip1 v21.2d, v5.2d, v23.2d + zip2 v24.4s, v23.4s, v5.4s + bif v21.16b, v7.16b, v1.16b + zip1 v1.4s, v24.4s, v7.4s + zip1 v24.4s, v7.4s, v24.4s + ext v1.16b, v24.16b, v1.16b, #8 + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v24.16b + add v19.4s, v22.4s, v19.4s + ext v24.16b, v21.16b, v21.16b, #12 + eor v20.16b, v19.16b, v20.16b + uzp1 v21.4s, v21.4s, v24.4s + ushr v24.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v20.16b, v20.16b, v24.16b + ext v6.16b, v6.16b, v6.16b, #4 + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v20.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #12 + add v6.4s, v6.4s, v16.4s + rev32 v16.8h, v22.8h + add v19.4s, v19.4s, v16.4s + eor v20.16b, v19.16b, v20.16b + ushr v22.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v22.16b + add v6.4s, v6.4s, v20.4s + eor v16.16b, v6.16b, v16.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v6.4s, v3.4s + ushr v6.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v6.16b, v16.16b, v6.16b + add v16.4s, v6.4s, v19.4s + eor v19.16b, v16.16b, v20.16b + ushr v20.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v20.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v3.4s, v3.4s, v19.4s + eor v6.16b, v3.16b, v6.16b + ext v16.16b, v16.16b, v16.16b, #4 + add v2.4s, v3.4s, v2.4s + rev32 v3.8h, v6.8h + add v6.4s, v16.4s, v3.4s + eor v16.16b, v6.16b, v19.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + orr v16.16b, v16.16b, v19.16b + add v2.4s, v2.4s, v16.4s + eor v3.16b, v2.16b, v3.16b + add v2.4s, v2.4s, v4.4s + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v4.16b + add v4.4s, v3.4s, v6.4s + eor v6.16b, v4.16b, v16.16b + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v6.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v6.16b, v4.16b, v6.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + add v2.4s, v2.4s, v23.4s + orr v6.16b, v6.16b, v16.16b + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + add v4.4s, v3.4s, v4.4s + eor v6.16b, v4.16b, v6.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v2.4s, v2.4s, v7.4s + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #4 + rev32 v3.8h, v3.8h + add v2.4s, v2.4s, v5.4s + mov v5.s[1], v7.s[2] + add v4.4s, v4.4s, v3.4s + bsl v0.16b, v5.16b, v17.16b + eor v5.16b, v4.16b, v6.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s + eor v3.16b, v2.16b, v3.16b + ushr v6.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v6.16b + add v4.4s, v3.4s, v4.4s + uzp2 v18.4s, v26.4s, v18.4s + eor v5.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v18.4s + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v5.16b, v5.16b, v6.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v5.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v0.4s, v2.4s, v0.4s + rev32 v2.8h, v3.8h + add v3.4s, v4.4s, v2.4s + eor v4.16b, v3.16b, v5.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v4.16b, v4.16b, v5.16b + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ushr v5.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + orr v2.16b, v2.16b, v5.16b + add v3.4s, v2.4s, v3.4s + eor v4.16b, v3.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #12 + ushr v5.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v0.4s, v0.4s, v21.4s + orr v4.16b, v4.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ext v3.16b, v3.16b, v3.16b, #4 + add v0.4s, v0.4s, v1.4s + rev32 v1.8h, v2.8h + add v2.4s, v3.4s, v1.4s + eor v3.16b, v2.16b, v4.16b + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v3.4s + eor v1.16b, v0.16b, v1.16b + ushr v4.4s, v1.4s, #8 + shl v1.4s, v1.4s, #24 + orr v1.16b, v1.16b, v4.16b + add v2.4s, v1.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v0.16b, v0.16b, v0.16b, #4 + ext v2.16b, v2.16b, v2.16b, #12 + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v1.16b, v1.16b, v1.16b, #8 + eor v0.16b, v2.16b, v0.16b + orr v2.16b, v3.16b, v4.16b + eor v1.16b, v2.16b, v1.16b + stp q0, q1, [x0] + ret +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI1_1: + .xword 0 + .xword -4294967296 +.LCPI1_2: + .xword -1 + .xword 4294967295 + .text + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: + .cfi_startproc + ldp q3, q2, [x0] + ldp q5, q6, [x1] + add x10, x1, #32 + lsr x11, x3, #32 + fmov s4, w3 + ld2 { v17.4s, v18.4s }, [x10] + adrp x10, .LCPI1_2 + and w8, w2, #0xff + mov v4.s[1], w11 + ldr q1, [x10, :lo12:.LCPI1_2] + and w9, w4, #0xff + adrp x12, .LCPI1_0 + mov v4.s[2], w8 + uzp1 v19.4s, v5.4s, v6.4s + add v3.4s, v2.4s, v3.4s + ldr q7, [x12, :lo12:.LCPI1_0] + mov v4.s[3], w9 + add v3.4s, v3.4s, v19.4s + uzp2 v5.4s, v5.4s, v6.4s + ext v21.16b, v18.16b, v18.16b, #12 + uzp1 v6.4s, v19.4s, v19.4s + ext v22.16b, v19.16b, v19.16b, #12 + eor v4.16b, v3.16b, v4.16b + ext v20.16b, v17.16b, v17.16b, #12 + ext v6.16b, v6.16b, v19.16b, #8 + ext v19.16b, v19.16b, v22.16b, #12 + zip1 v22.2d, v21.2d, v5.2d + rev32 v24.8h, v4.8h + mov v4.16b, v1.16b + zip2 v23.4s, v5.4s, v21.4s + uzp2 v6.4s, v6.4s, v5.4s + bsl v4.16b, v22.16b, v20.16b + add v3.4s, v3.4s, v5.4s + zip1 v5.4s, v23.4s, v20.4s + zip1 v22.4s, v20.4s, v23.4s + add v23.4s, v24.4s, v7.4s + ext v7.16b, v6.16b, v6.16b, #4 + ext v25.16b, v4.16b, v4.16b, #12 + ext v5.16b, v22.16b, v5.16b, #8 + eor v2.16b, v23.16b, v2.16b + uzp1 v4.4s, v4.4s, v25.4s + uzp1 v22.4s, v7.4s, v7.4s + ext v25.16b, v7.16b, v7.16b, #12 + ext v22.16b, v22.16b, v7.16b, #8 + ext v7.16b, v7.16b, v25.16b, #12 + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + add v3.4s, v3.4s, v2.4s + eor v24.16b, v3.16b, v24.16b + add v3.4s, v3.4s, v17.4s + ushr v17.4s, v24.4s, #8 + shl v18.4s, v24.4s, #24 + orr v17.16b, v18.16b, v17.16b + add v18.4s, v17.4s, v23.4s + eor v2.16b, v18.16b, v2.16b + ushr v23.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v3.16b, v3.16b, v3.16b, #12 + orr v2.16b, v2.16b, v23.16b + ext v17.16b, v17.16b, v17.16b, #8 + add v3.4s, v2.4s, v3.4s + adrp x11, .LCPI1_1 + eor v17.16b, v3.16b, v17.16b + ldr q16, [x11, :lo12:.LCPI1_1] + ext v18.16b, v18.16b, v18.16b, #4 + rev32 v24.8h, v17.8h + movi v0.2d, #0xffffffff00000000 + add v23.4s, v3.4s, v21.4s + mov v21.s[1], v20.s[2] + add v20.4s, v18.4s, v24.4s + bit v19.16b, v21.16b, v0.16b + eor v3.16b, v20.16b, v2.16b + uzp2 v2.4s, v22.4s, v19.4s + zip1 v17.2d, v5.2d, v19.2d + zip2 v18.4s, v19.4s, v5.4s + ushr v21.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + ext v22.16b, v2.16b, v2.16b, #4 + bsl v16.16b, v4.16b, v17.16b + zip1 v17.4s, v18.4s, v4.4s + zip1 v18.4s, v4.4s, v18.4s + orr v21.16b, v3.16b, v21.16b + ext v25.16b, v16.16b, v16.16b, #12 + ext v3.16b, v18.16b, v17.16b, #8 + uzp1 v18.4s, v22.4s, v22.4s + ext v26.16b, v22.16b, v22.16b, #12 + add v23.4s, v23.4s, v21.4s + uzp1 v17.4s, v16.4s, v25.4s + ext v16.16b, v18.16b, v22.16b, #8 + ext v18.16b, v22.16b, v26.16b, #12 + eor v22.16b, v23.16b, v24.16b + add v6.4s, v23.4s, v6.4s + ushr v23.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v23.16b + add v20.4s, v22.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ushr v23.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v21.16b, v21.16b, v23.16b + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v21.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v20.16b, v20.16b, v20.16b, #12 + add v6.4s, v6.4s, v19.4s + rev32 v19.8h, v22.8h + add v20.4s, v20.4s, v19.4s + eor v21.16b, v20.16b, v21.16b + ushr v22.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + orr v21.16b, v21.16b, v22.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ushr v22.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v22.16b + add v20.4s, v19.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #12 + ushr v22.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v21.16b, v21.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ext v20.16b, v20.16b, v20.16b, #4 + rev32 v19.8h, v19.8h + add v20.4s, v20.4s, v19.4s + add v6.4s, v6.4s, v5.4s + mov v5.s[1], v4.s[2] + eor v4.16b, v20.16b, v21.16b + ushr v21.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v21.16b, v4.16b, v21.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + add v2.4s, v6.4s, v2.4s + ushr v6.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v6.16b, v19.16b, v6.16b + add v19.4s, v6.4s, v20.4s + eor v20.16b, v19.16b, v21.16b + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v20.4s, v2.4s + eor v6.16b, v2.16b, v6.16b + ext v19.16b, v19.16b, v19.16b, #12 + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v6.4s + mov v22.16b, v0.16b + eor v20.16b, v19.16b, v20.16b + bsl v22.16b, v5.16b, v7.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + add v2.4s, v2.4s, v22.4s + orr v20.16b, v20.16b, v21.16b + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + ushr v21.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v6.16b, v6.16b, v21.16b + add v19.4s, v6.4s, v19.4s + eor v20.16b, v19.16b, v20.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v2.4s, v2.4s, v17.4s + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + uzp2 v5.4s, v16.4s, v22.4s + zip1 v7.2d, v3.2d, v22.2d + zip2 v16.4s, v22.4s, v3.4s + ext v19.16b, v19.16b, v19.16b, #4 + rev32 v22.8h, v6.8h + ext v23.16b, v5.16b, v5.16b, #4 + bif v7.16b, v17.16b, v1.16b + zip1 v24.4s, v16.4s, v17.4s + zip1 v16.4s, v17.4s, v16.4s + add v21.4s, v2.4s, v3.4s + mov v3.s[1], v17.s[2] + add v17.4s, v19.4s, v22.4s + mov v19.16b, v0.16b + ext v25.16b, v7.16b, v7.16b, #12 + ext v4.16b, v16.16b, v24.16b, #8 + uzp1 v16.4s, v23.4s, v23.4s + bsl v19.16b, v3.16b, v18.16b + eor v2.16b, v17.16b, v20.16b + uzp1 v7.4s, v7.4s, v25.4s + ext v25.16b, v16.16b, v23.16b, #8 + zip1 v3.2d, v4.2d, v19.2d + ushr v20.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ext v24.16b, v23.16b, v23.16b, #12 + uzp2 v6.4s, v25.4s, v19.4s + zip2 v18.4s, v19.4s, v4.4s + bif v3.16b, v7.16b, v1.16b + orr v20.16b, v2.16b, v20.16b + ext v16.16b, v23.16b, v24.16b, #12 + ext v23.16b, v6.16b, v6.16b, #4 + zip1 v24.4s, v18.4s, v7.4s + zip1 v18.4s, v7.4s, v18.4s + ext v25.16b, v3.16b, v3.16b, #12 + add v21.4s, v21.4s, v20.4s + ext v2.16b, v18.16b, v24.16b, #8 + uzp1 v18.4s, v23.4s, v23.4s + ext v24.16b, v23.16b, v23.16b, #12 + uzp1 v3.4s, v3.4s, v25.4s + eor v22.16b, v21.16b, v22.16b + ext v25.16b, v18.16b, v23.16b, #8 + dup v18.4s, v2.s[3] + ext v23.16b, v23.16b, v24.16b, #12 + add v5.4s, v21.4s, v5.4s + trn1 v21.4s, v3.4s, v3.4s + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + ext v18.16b, v21.16b, v18.16b, #8 + orr v21.16b, v22.16b, v24.16b + add v17.4s, v21.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ushr v22.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v20.16b, v22.16b + ext v21.16b, v21.16b, v21.16b, #8 + add v5.4s, v20.4s, v5.4s + eor v21.16b, v5.16b, v21.16b + ext v17.16b, v17.16b, v17.16b, #12 + add v5.4s, v5.4s, v19.4s + rev32 v19.8h, v21.8h + add v17.4s, v17.4s, v19.4s + eor v20.16b, v17.16b, v20.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v21.16b + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ushr v21.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v21.16b + add v17.4s, v19.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v5.4s, v5.4s, v7.4s + orr v20.16b, v20.16b, v21.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ext v17.16b, v17.16b, v17.16b, #4 + rev32 v22.8h, v19.8h + add v21.4s, v5.4s, v4.4s + mov v4.s[1], v7.s[2] + add v19.4s, v17.4s, v22.4s + bit v16.16b, v4.16b, v0.16b + eor v5.16b, v19.16b, v20.16b + uzp2 v4.4s, v25.4s, v16.4s + zip1 v7.2d, v2.2d, v16.2d + zip2 v17.4s, v16.4s, v2.4s + ushr v20.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v24.16b, v4.16b, v4.16b, #4 + bif v7.16b, v3.16b, v1.16b + zip1 v25.4s, v17.4s, v3.4s + zip1 v17.4s, v3.4s, v17.4s + orr v20.16b, v5.16b, v20.16b + ext v26.16b, v7.16b, v7.16b, #12 + ext v5.16b, v17.16b, v25.16b, #8 + uzp1 v17.4s, v24.4s, v24.4s + ext v25.16b, v24.16b, v24.16b, #12 + bit v23.16b, v18.16b, v0.16b + add v21.4s, v21.4s, v20.4s + uzp1 v7.4s, v7.4s, v26.4s + ext v26.16b, v17.16b, v24.16b, #8 + ext v17.16b, v24.16b, v25.16b, #12 + eor v22.16b, v21.16b, v22.16b + add v6.4s, v21.4s, v6.4s + zip1 v21.2d, v5.2d, v23.2d + zip2 v24.4s, v23.4s, v5.4s + bif v21.16b, v7.16b, v1.16b + zip1 v1.4s, v24.4s, v7.4s + zip1 v24.4s, v7.4s, v24.4s + ext v1.16b, v24.16b, v1.16b, #8 + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v24.16b + add v19.4s, v22.4s, v19.4s + ext v24.16b, v21.16b, v21.16b, #12 + eor v20.16b, v19.16b, v20.16b + uzp1 v21.4s, v21.4s, v24.4s + ushr v24.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v20.16b, v20.16b, v24.16b + ext v6.16b, v6.16b, v6.16b, #4 + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v20.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #12 + add v6.4s, v6.4s, v16.4s + rev32 v16.8h, v22.8h + add v19.4s, v19.4s, v16.4s + eor v20.16b, v19.16b, v20.16b + ushr v22.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v22.16b + add v6.4s, v6.4s, v20.4s + eor v16.16b, v6.16b, v16.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v6.4s, v3.4s + ushr v6.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v6.16b, v16.16b, v6.16b + add v16.4s, v6.4s, v19.4s + eor v19.16b, v16.16b, v20.16b + ushr v20.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v20.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v3.4s, v3.4s, v19.4s + eor v6.16b, v3.16b, v6.16b + ext v16.16b, v16.16b, v16.16b, #4 + add v2.4s, v3.4s, v2.4s + rev32 v3.8h, v6.8h + add v6.4s, v16.4s, v3.4s + eor v16.16b, v6.16b, v19.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + orr v16.16b, v16.16b, v19.16b + add v2.4s, v2.4s, v16.4s + eor v3.16b, v2.16b, v3.16b + add v2.4s, v2.4s, v4.4s + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v4.16b + add v4.4s, v3.4s, v6.4s + eor v6.16b, v4.16b, v16.16b + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v6.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v6.16b, v4.16b, v6.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + add v2.4s, v2.4s, v23.4s + orr v6.16b, v6.16b, v16.16b + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + add v4.4s, v3.4s, v4.4s + eor v6.16b, v4.16b, v6.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v2.4s, v2.4s, v7.4s + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #4 + rev32 v3.8h, v3.8h + add v2.4s, v2.4s, v5.4s + mov v5.s[1], v7.s[2] + add v4.4s, v4.4s, v3.4s + bsl v0.16b, v5.16b, v17.16b + eor v5.16b, v4.16b, v6.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s + eor v3.16b, v2.16b, v3.16b + ushr v6.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v6.16b + add v4.4s, v3.4s, v4.4s + uzp2 v18.4s, v26.4s, v18.4s + eor v5.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v18.4s + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v5.16b, v5.16b, v6.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v5.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v0.4s, v2.4s, v0.4s + rev32 v2.8h, v3.8h + add v3.4s, v4.4s, v2.4s + eor v4.16b, v3.16b, v5.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v4.16b, v4.16b, v5.16b + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ushr v5.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + orr v2.16b, v2.16b, v5.16b + add v3.4s, v2.4s, v3.4s + eor v4.16b, v3.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #12 + ushr v5.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v0.4s, v0.4s, v21.4s + orr v4.16b, v4.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ext v3.16b, v3.16b, v3.16b, #4 + add v0.4s, v0.4s, v1.4s + rev32 v1.8h, v2.8h + add v2.4s, v3.4s, v1.4s + eor v3.16b, v2.16b, v4.16b + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v3.4s + eor v1.16b, v0.16b, v1.16b + ushr v4.4s, v1.4s, #8 + shl v1.4s, v1.4s, #24 + orr v1.16b, v1.16b, v4.16b + add v2.4s, v1.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v0.16b, v0.16b, v0.16b, #4 + ext v1.16b, v1.16b, v1.16b, #8 + ext v2.16b, v2.16b, v2.16b, #12 + orr v3.16b, v3.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + eor v3.16b, v3.16b, v1.16b + stp q0, q3, [x5] + ldr q0, [x0] + eor v0.16b, v0.16b, v2.16b + str q0, [x5, #32] + ldr q0, [x0, #16] + eor v0.16b, v0.16b, v1.16b + str q0, [x5, #48] + ret +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_0: + .word 0 + .word 1 + .word 2 + .word 3 + .text + .globl zfs_blake3_hash_many_sse2 + .p2align 2 + .type zfs_blake3_hash_many_sse2,@function +zfs_blake3_hash_many_sse2: + .cfi_startproc + stp d15, d14, [sp, #-160]! + stp d13, d12, [sp, #16] + stp d11, d10, [sp, #32] + stp d9, d8, [sp, #48] + stp x29, x30, [sp, #64] + stp x28, x27, [sp, #80] + stp x26, x25, [sp, #96] + stp x24, x23, [sp, #112] + stp x22, x21, [sp, #128] + stp x20, x19, [sp, #144] + mov x29, sp + sub sp, sp, #384 + .cfi_def_cfa w29, 160 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + ldr x26, [x29, #168] + ldrb w27, [x29, #160] + mov w19, w6 + mov x20, x4 + mov x22, x2 + mov x28, x1 + cmp x1, #4 + mov x24, x0 + str x3, [sp, #40] + b.lo .LBB2_8 + adrp x9, .LCPI2_0 + ldr q0, [x9, :lo12:.LCPI2_0] + sbfx w11, w5, #0, #1 + dup v1.4s, w11 + mov w9, #58983 + mov w10, #44677 + and v0.16b, v1.16b, v0.16b + mov w11, #62322 + mov w12, #62778 + orr w8, w7, w19 + movk w9, #27145, lsl #16 + movk w10, #47975, lsl #16 + movk w11, #15470, lsl #16 + str q0, [sp, #16] + orr v0.4s, #128, lsl #24 + movk w12, #42319, lsl #16 + str q0, [sp] +.LBB2_2: + ldr x0, [sp, #40] + mov x13, x0 + ld1r { v20.4s }, [x13], #4 + add x14, x0, #8 + add x15, x0, #12 + add x16, x0, #16 + add x17, x0, #20 + add x18, x0, #24 + add x0, x0, #28 + ld1r { v17.4s }, [x14] + ld1r { v6.4s }, [x15] + ld1r { v8.4s }, [x16] + ld1r { v9.4s }, [x17] + ld1r { v31.4s }, [x18] + ld1r { v26.4s }, [x13] + ld1r { v15.4s }, [x0] + cbz x22, .LBB2_7 + ldr q1, [sp, #16] + dup v0.4s, w20 + ldp x13, x14, [x24] + ldp x15, x16, [x24, #16] + add v1.4s, v0.4s, v1.4s + movi v0.4s, #128, lsl #24 + str q1, [sp, #64] + eor v0.16b, v1.16b, v0.16b + ldr q1, [sp] + lsr x18, x20, #32 + mov x17, xzr + cmgt v0.4s, v1.4s, v0.4s + dup v1.4s, w18 + sub v0.4s, v1.4s, v0.4s + mov w18, w8 + str q0, [sp, #48] +.LBB2_4: + mov w2, #16 + bfi x2, x17, #6, #58 + ldr q1, [x13, x2] + ldr q3, [x14, x2] + ldr q2, [x15, x2] + ldr q4, [x16, x2] + mov w2, #32 + bfi x2, x17, #6, #58 + ldr q5, [x13, x2] + ldr q18, [x14, x2] + ldr q19, [x15, x2] + ldr q23, [x16, x2] + mov w2, #48 + lsl x3, x17, #6 + bfi x2, x17, #6, #58 + add x17, x17, #1 + ldr q0, [x13, x3] + ldr q21, [x14, x3] + ldr q7, [x15, x3] + ldr q16, [x16, x3] + cmp x17, x22 + ldr q13, [x13, x2] + ldr q14, [x14, x2] + ldr q29, [x15, x2] + ldr q10, [x16, x2] + csel w2, w27, wzr, eq + orr w18, w2, w18 + mov x0, xzr + and w18, w18, #0xff + add x3, x3, #256 +.LBB2_5: + ldr x2, [x24, x0] + add x0, x0, #8 + cmp x0, #32 + add x2, x2, x3 + prfm pldl1keep, [x2] + b.ne .LBB2_5 + dup v22.4s, w18 + str q22, [sp, #192] + zip1 v27.4s, v0.4s, v21.4s + zip2 v21.4s, v0.4s, v21.4s + zip1 v0.4s, v7.4s, v16.4s + zip2 v22.4s, v7.4s, v16.4s + zip1 v7.4s, v1.4s, v3.4s + zip1 v25.4s, v2.4s, v4.4s + zip2 v16.4s, v2.4s, v4.4s + zip1 v11.4s, v19.4s, v23.4s + zip2 v12.4s, v19.4s, v23.4s + zip1 v19.4s, v13.4s, v14.4s + zip2 v23.4s, v13.4s, v14.4s + zip1 v13.4s, v29.4s, v10.4s + zip2 v14.4s, v29.4s, v10.4s + add v10.4s, v20.4s, v8.4s + add v2.4s, v26.4s, v9.4s + ext v20.16b, v22.16b, v21.16b, #8 + ext v26.16b, v25.16b, v7.16b, #8 + zip2 v24.4s, v1.4s, v3.4s + add v1.4s, v6.4s, v15.4s + ext v6.16b, v0.16b, v27.16b, #8 + ext v20.16b, v21.16b, v20.16b, #8 + mov v21.d[1], v22.d[0] + ext v22.16b, v7.16b, v26.16b, #8 + mov v7.d[1], v25.d[0] + add v3.4s, v17.4s, v31.4s + str q1, [sp, #144] + ext v1.16b, v27.16b, v6.16b, #8 + mov v6.16b, v7.16b + zip1 v28.4s, v5.4s, v18.4s + stur q1, [x29, #-80] + mov v1.16b, v27.16b + mov v27.16b, v24.16b + add v3.4s, v3.4s, v6.4s + ldr q6, [sp, #64] + ext v29.16b, v16.16b, v24.16b, #8 + mov v1.d[1], v0.d[0] + ext v0.16b, v11.16b, v28.16b, #8 + mov v27.d[1], v16.d[0] + ext v16.16b, v14.16b, v23.16b, #8 + stur q7, [x29, #-144] + ext v7.16b, v24.16b, v29.16b, #8 + ext v29.16b, v28.16b, v0.16b, #8 + ext v0.16b, v23.16b, v16.16b, #8 + mov v23.d[1], v14.d[0] + stp q0, q23, [sp, #80] + add v0.4s, v10.4s, v1.4s + eor v16.16b, v0.16b, v6.16b + ldr q6, [sp, #48] + add v2.4s, v2.4s, v21.4s + mov v28.d[1], v11.d[0] + zip2 v18.4s, v5.4s, v18.4s + eor v10.16b, v2.16b, v6.16b + movi v6.4s, #64 + eor v11.16b, v3.16b, v6.16b + ldr q6, [sp, #144] + dup v17.4s, w9 + ext v30.16b, v12.16b, v18.16b, #8 + rev32 v16.8h, v16.8h + dup v5.4s, w10 + ext v25.16b, v18.16b, v30.16b, #8 + mov v30.16b, v23.16b + mov v23.16b, v1.16b + str q1, [sp, #160] + rev32 v10.8h, v10.8h + add v1.4s, v16.4s, v17.4s + add v17.4s, v6.4s, v27.4s + ldr q6, [sp, #192] + dup v4.4s, w11 + rev32 v11.8h, v11.8h + add v5.4s, v10.4s, v5.4s + eor v8.16b, v1.16b, v8.16b + stur q21, [x29, #-128] + mov v18.d[1], v12.d[0] + add v4.4s, v11.4s, v4.4s + eor v9.16b, v5.16b, v9.16b + ushr v12.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + ldur q21, [x29, #-80] + ext v26.16b, v13.16b, v19.16b, #8 + eor v31.16b, v4.16b, v31.16b + orr v8.16b, v8.16b, v12.16b + ushr v12.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + ext v26.16b, v19.16b, v26.16b, #8 + mov v19.d[1], v13.d[0] + orr v9.16b, v9.16b, v12.16b + ushr v12.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v17.16b, v6.16b + orr v31.16b, v31.16b, v12.16b + dup v12.4s, w12 + rev32 v13.8h, v13.8h + add v12.4s, v13.4s, v12.4s + add v0.4s, v0.4s, v21.4s + eor v14.16b, v12.16b, v15.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v20.4s + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v22.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v28.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v18.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v19.4s + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v30.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + mov v24.16b, v7.16b + stur q7, [x29, #-112] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + mov v7.16b, v26.16b + add v3.4s, v3.4s, v26.4s + ldr q26, [sp, #80] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + add v0.4s, v0.4s, v29.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v25.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v13.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + str q22, [sp, #128] + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + ldur q22, [x29, #-128] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + eor v8.16b, v5.16b, v8.16b + mov v6.16b, v18.16b + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + ldur q18, [x29, #-144] + orr v8.16b, v8.16b, v15.16b + add v0.4s, v0.4s, v22.4s + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v20.4s + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v24.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v18.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v13.16b, v17.16b, v13.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v13.8h, v13.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v13.4s + add v0.4s, v0.4s, v27.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v6.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v23.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v21.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v19.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v29.4s + str q28, [sp, #112] + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldp q28, q23, [sp, #112] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ldr q21, [sp, #96] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + add v0.4s, v0.4s, v25.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v23.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v21.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v28.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v13.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + eor v8.16b, v5.16b, v8.16b + mov v30.16b, v29.16b + mov v29.16b, v25.16b + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + ldur q25, [x29, #-112] + orr v8.16b, v8.16b, v15.16b + add v0.4s, v0.4s, v20.4s + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v6.4s + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v7.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v25.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v13.16b, v17.16b, v13.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v13.8h, v13.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v13.4s + add v0.4s, v0.4s, v18.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v19.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v22.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v21.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v27.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v30.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v29.4s + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v28.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + ldr q24, [sp, #160] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + stur q7, [x29, #-64] + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + mov v7.16b, v26.16b + add v3.4s, v3.4s, v26.4s + ldur q26, [x29, #-80] + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + add v0.4s, v0.4s, v23.4s + orr v8.16b, v8.16b, v15.16b + add v15.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v24.4s + eor v0.16b, v15.16b, v13.16b + add v2.4s, v2.4s, v31.4s + ushr v13.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + orr v0.16b, v0.16b, v13.16b + ushr v13.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v13.16b + ushr v13.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v13.16b + ushr v13.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v13.16b + ushr v13.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + orr v9.16b, v9.16b, v13.16b + ushr v13.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + add v1.4s, v10.4s, v1.4s + orr v31.16b, v31.16b, v13.16b + eor v13.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + ushr v14.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v14.16b + ushr v14.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + stur q6, [x29, #-96] + orr v8.16b, v8.16b, v14.16b + add v14.4s, v15.4s, v6.4s + ldur q6, [x29, #-64] + mov v18.16b, v19.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v18.4s + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v21.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v6.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + str q27, [sp, #176] + mov v27.16b, v30.16b + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + add v14.4s, v14.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v27.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v20.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + mov v30.16b, v23.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v0.16b, v17.16b, v0.16b + add v1.4s, v16.4s, v1.4s + ldur q23, [x29, #-144] + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v0.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v23.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v29.4s + orr v13.16b, v13.16b, v15.16b + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v30.4s + rev32 v0.8h, v0.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + add v4.4s, v4.4s, v0.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldur q22, [x29, #-128] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + ldr q26, [sp, #176] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v5.4s, v11.4s + add v14.4s, v14.4s, v24.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v22.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v28.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v15.16b + add v14.4s, v14.4s, v18.4s + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v27.4s + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v7.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v21.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + add v14.4s, v14.4s, v6.4s + ldur q6, [x29, #-96] + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + stur q20, [x29, #-160] + mov v20.16b, v29.16b + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + mov v19.16b, v29.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v20.4s + mov v19.16b, v28.16b + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v6.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v19.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v0.16b, v17.16b, v0.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v0.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v25.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v30.4s + orr v13.16b, v13.16b, v15.16b + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v24.4s + rev32 v0.8h, v0.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + mov v29.16b, v27.16b + add v4.4s, v4.4s, v0.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldur q27, [x29, #-160] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ldur q6, [x29, #-80] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v5.4s, v11.4s + add v14.4s, v14.4s, v22.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v27.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v6.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v23.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v15.16b + add v14.4s, v14.4s, v29.4s + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v20.4s + mov v28.16b, v7.16b + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v19.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v28.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + add v14.4s, v14.4s, v21.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v30.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + add v3.4s, v3.4s, v18.4s + orr v10.16b, v10.16b, v15.16b + add v15.4s, v3.4s, v31.4s + eor v3.16b, v15.16b, v11.16b + ushr v11.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v11.16b, v3.16b, v11.16b + add v3.4s, v17.4s, v6.4s + add v17.4s, v3.4s, v13.4s + eor v0.16b, v17.16b, v0.16b + ushr v3.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + add v1.4s, v16.4s, v1.4s + orr v0.16b, v0.16b, v3.16b + eor v3.16b, v1.16b, v8.16b + ushr v8.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + add v5.4s, v10.4s, v5.4s + orr v8.16b, v3.16b, v8.16b + eor v3.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + ushr v9.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + eor v31.16b, v4.16b, v31.16b + mov v7.16b, v23.16b + mov v23.16b, v28.16b + mov v28.16b, v6.16b + orr v3.16b, v3.16b, v9.16b + ushr v9.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + ldur q6, [x29, #-64] + orr v31.16b, v31.16b, v9.16b + add v9.4s, v0.4s, v12.4s + eor v12.16b, v9.16b, v13.16b + ushr v13.4s, v12.4s, #7 + shl v12.4s, v12.4s, #25 + orr v12.16b, v12.16b, v13.16b + add v13.4s, v14.4s, v6.4s + add v13.4s, v13.4s, v3.4s + eor v0.16b, v13.16b, v0.16b + add v2.4s, v2.4s, v24.4s + rev32 v14.8h, v0.8h + add v0.4s, v2.4s, v31.4s + add v6.4s, v4.4s, v14.4s + eor v2.16b, v0.16b, v16.16b + eor v3.16b, v6.16b, v3.16b + rev32 v16.8h, v2.8h + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v2.4s, v9.4s, v16.4s + orr v4.16b, v3.16b, v4.16b + eor v3.16b, v2.16b, v31.16b + ushr v31.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v31.16b + add v31.4s, v15.4s, v22.4s + add v31.4s, v31.4s, v12.4s + add v17.4s, v17.4s, v7.4s + eor v9.16b, v31.16b, v10.16b + add v17.4s, v17.4s, v8.4s + rev32 v9.8h, v9.8h + eor v11.16b, v17.16b, v11.16b + add v1.4s, v1.4s, v9.4s + rev32 v11.8h, v11.8h + eor v10.16b, v1.16b, v12.16b + add v5.4s, v5.4s, v11.4s + ushr v12.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v8.16b, v5.16b, v8.16b + orr v10.16b, v10.16b, v12.16b + ushr v12.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + orr v8.16b, v8.16b, v12.16b + add v12.4s, v13.4s, v27.4s + add v12.4s, v12.4s, v4.4s + eor v13.16b, v12.16b, v14.16b + ldur q14, [x29, #-96] + mov v25.16b, v29.16b + add v29.4s, v12.4s, v20.4s + add v20.4s, v31.4s, v26.4s + add v0.4s, v0.4s, v14.4s + add v0.4s, v0.4s, v3.4s + eor v16.16b, v0.16b, v16.16b + add v0.4s, v0.4s, v30.4s + ldur q30, [x29, #-112] + add v20.4s, v20.4s, v10.4s + eor v31.16b, v20.16b, v9.16b + add v20.4s, v20.4s, v28.4s + add v17.4s, v17.4s, v30.4s + add v17.4s, v17.4s, v8.4s + eor v9.16b, v17.16b, v11.16b + ushr v28.4s, v13.4s, #8 + shl v11.4s, v13.4s, #24 + orr v28.16b, v11.16b, v28.16b + ushr v11.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v16.16b, v16.16b, v11.16b + ushr v11.4s, v31.4s, #8 + shl v31.4s, v31.4s, #24 + add v6.4s, v28.4s, v6.4s + orr v31.16b, v31.16b, v11.16b + ushr v11.4s, v9.4s, #8 + shl v9.4s, v9.4s, #24 + add v2.4s, v16.4s, v2.4s + eor v4.16b, v6.16b, v4.16b + orr v9.16b, v9.16b, v11.16b + add v1.4s, v31.4s, v1.4s + eor v3.16b, v2.16b, v3.16b + ushr v11.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v5.4s, v9.4s, v5.4s + eor v10.16b, v1.16b, v10.16b + orr v4.16b, v4.16b, v11.16b + ushr v11.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v3.16b, v3.16b, v11.16b + ushr v11.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + orr v10.16b, v10.16b, v11.16b + ushr v11.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v11.16b + add v29.4s, v29.4s, v8.4s + eor v16.16b, v29.16b, v16.16b + add v0.4s, v0.4s, v4.4s + mov v12.16b, v26.16b + add v17.4s, v17.4s, v19.4s + add v26.4s, v29.4s, v23.4s + eor v29.16b, v0.16b, v31.16b + add v20.4s, v20.4s, v3.4s + rev32 v16.8h, v16.8h + stur q18, [x29, #-176] + mov v18.16b, v27.16b + add v0.4s, v0.4s, v24.4s + eor v27.16b, v20.16b, v9.16b + add v17.4s, v17.4s, v10.4s + rev32 v24.8h, v29.8h + add v1.4s, v1.4s, v16.4s + add v20.4s, v20.4s, v25.4s + eor v25.16b, v17.16b, v28.16b + rev32 v27.8h, v27.8h + add v5.4s, v5.4s, v24.4s + eor v28.16b, v1.16b, v8.16b + rev32 v25.8h, v25.8h + add v6.4s, v6.4s, v27.4s + eor v4.16b, v5.16b, v4.16b + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + add v2.4s, v2.4s, v25.4s + eor v3.16b, v6.16b, v3.16b + orr v28.16b, v28.16b, v31.16b + ushr v31.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v29.16b, v2.16b, v10.16b + orr v4.16b, v4.16b, v31.16b + ushr v31.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v26.4s, v26.4s, v28.4s + orr v3.16b, v3.16b, v31.16b + ushr v31.4s, v29.4s, #12 + shl v29.4s, v29.4s, #20 + eor v16.16b, v26.16b, v16.16b + add v0.4s, v0.4s, v4.4s + add v17.4s, v17.4s, v12.4s + orr v29.16b, v29.16b, v31.16b + eor v24.16b, v0.16b, v24.16b + add v0.4s, v0.4s, v22.4s + add v20.4s, v20.4s, v3.4s + ushr v22.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + add v23.4s, v26.4s, v21.4s + eor v21.16b, v20.16b, v27.16b + add v17.4s, v17.4s, v29.4s + orr v16.16b, v16.16b, v22.16b + ushr v22.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v25.16b, v17.16b, v25.16b + orr v22.16b, v24.16b, v22.16b + ushr v24.4s, v21.4s, #8 + shl v21.4s, v21.4s, #24 + orr v21.16b, v21.16b, v24.16b + ushr v24.4s, v25.4s, #8 + shl v25.4s, v25.4s, #24 + add v1.4s, v16.4s, v1.4s + orr v24.16b, v25.16b, v24.16b + add v5.4s, v22.4s, v5.4s + eor v25.16b, v1.16b, v28.16b + add v6.4s, v21.4s, v6.4s + eor v4.16b, v5.16b, v4.16b + ushr v27.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + add v2.4s, v24.4s, v2.4s + eor v3.16b, v6.16b, v3.16b + orr v25.16b, v25.16b, v27.16b + ushr v27.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + ldur q19, [x29, #-176] + eor v26.16b, v2.16b, v29.16b + orr v4.16b, v4.16b, v27.16b + ushr v27.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + orr v3.16b, v3.16b, v27.16b + ushr v27.4s, v26.4s, #7 + shl v26.4s, v26.4s, #25 + add v20.4s, v20.4s, v18.4s + add v17.4s, v17.4s, v30.4s + orr v26.16b, v26.16b, v27.16b + add v0.4s, v0.4s, v3.4s + eor v16.16b, v0.16b, v16.16b + add v0.4s, v0.4s, v19.4s + add v19.4s, v20.4s, v26.4s + add v17.4s, v17.4s, v25.4s + eor v20.16b, v19.16b, v22.16b + add v7.4s, v19.4s, v7.4s + eor v19.16b, v17.16b, v21.16b + ldur q21, [x29, #-64] + add v23.4s, v23.4s, v4.4s + eor v24.16b, v23.16b, v24.16b + rev32 v16.8h, v16.8h + add v17.4s, v17.4s, v21.4s + rev32 v21.8h, v24.8h + add v6.4s, v6.4s, v21.4s + rev32 v20.8h, v20.8h + add v2.4s, v2.4s, v16.4s + eor v4.16b, v6.16b, v4.16b + rev32 v19.8h, v19.8h + add v1.4s, v1.4s, v20.4s + eor v3.16b, v2.16b, v3.16b + ushr v24.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v5.4s, v5.4s, v19.4s + eor v22.16b, v1.16b, v26.16b + orr v4.16b, v4.16b, v24.16b + ushr v24.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v18.4s, v23.4s, v14.4s + eor v23.16b, v5.16b, v25.16b + orr v3.16b, v3.16b, v24.16b + ushr v24.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + orr v22.16b, v22.16b, v24.16b + ushr v24.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v24.16b + add v18.4s, v18.4s, v4.4s + add v0.4s, v0.4s, v3.4s + add v24.4s, v17.4s, v23.4s + eor v17.16b, v18.16b, v21.16b + add v7.4s, v7.4s, v22.4s + eor v16.16b, v0.16b, v16.16b + ushr v21.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + eor v20.16b, v7.16b, v20.16b + orr v21.16b, v17.16b, v21.16b + ushr v17.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v19.16b, v24.16b, v19.16b + orr v16.16b, v16.16b, v17.16b + ushr v17.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + orr v25.16b, v20.16b, v17.16b + ushr v17.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v17.16b + add v1.4s, v25.4s, v1.4s + eor v22.16b, v1.16b, v22.16b + eor v20.16b, v1.16b, v18.16b + add v1.4s, v19.4s, v5.4s + eor v26.16b, v1.16b, v0.16b + add v0.4s, v21.4s, v6.4s + eor v5.16b, v1.16b, v23.16b + eor v1.16b, v0.16b, v4.16b + eor v17.16b, v0.16b, v7.16b + add v0.4s, v16.4s, v2.4s + eor v2.16b, v0.16b, v3.16b + eor v6.16b, v0.16b, v24.16b + ushr v0.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v0.16b, v1.16b, v0.16b + ushr v1.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v2.16b, v1.16b + ushr v2.4s, v22.4s, #7 + shl v3.4s, v22.4s, #25 + orr v2.16b, v3.16b, v2.16b + ushr v3.4s, v5.4s, #7 + shl v4.4s, v5.4s, #25 + orr v3.16b, v4.16b, v3.16b + eor v8.16b, v16.16b, v3.16b + eor v9.16b, v25.16b, v0.16b + eor v31.16b, v1.16b, v19.16b + cmp x17, x22 + eor v15.16b, v2.16b, v21.16b + mov w18, w19 + b.ne .LBB2_4 +.LBB2_7: + zip1 v0.4s, v20.4s, v26.4s + zip2 v1.4s, v20.4s, v26.4s + zip1 v2.4s, v17.4s, v6.4s + zip2 v3.4s, v17.4s, v6.4s + zip1 v4.4s, v8.4s, v9.4s + zip2 v5.4s, v8.4s, v9.4s + zip1 v6.4s, v31.4s, v15.4s + zip2 v7.4s, v31.4s, v15.4s + add x13, x20, #4 + tst w5, #0x1 + sub x28, x28, #4 + zip1 v16.2d, v0.2d, v2.2d + zip2 v0.2d, v0.2d, v2.2d + zip1 v2.2d, v1.2d, v3.2d + zip2 v1.2d, v1.2d, v3.2d + zip1 v3.2d, v4.2d, v6.2d + zip2 v4.2d, v4.2d, v6.2d + zip1 v6.2d, v5.2d, v7.2d + zip2 v5.2d, v5.2d, v7.2d + add x24, x24, #32 + csel x20, x13, x20, ne + cmp x28, #3 + stp q16, q3, [x26] + stp q0, q4, [x26, #32] + stp q2, q6, [x26, #64] + stp q1, q5, [x26, #96] + add x26, x26, #128 + b.hi .LBB2_2 +.LBB2_8: + cbz x28, .LBB2_16 + orr w8, w7, w19 + and x21, x5, #0x1 + stur w8, [x29, #-64] +.LBB2_10: + ldr x8, [sp, #40] + ldr x25, [x24] + ldur w4, [x29, #-64] + ldp q1, q0, [x8] + mov x8, x22 + stp q1, q0, [x29, #-48] +.LBB2_11: + subs x23, x8, #1 + b.eq .LBB2_13 + cbnz x8, .LBB2_14 + b .LBB2_15 +.LBB2_13: + orr w4, w4, w27 +.LBB2_14: + sub x0, x29, #48 + mov w2, #64 + mov x1, x25 + mov x3, x20 + bl zfs_blake3_compress_in_place_sse2 + add x25, x25, #64 + mov x8, x23 + mov w4, w19 + b .LBB2_11 +.LBB2_15: + ldp q0, q1, [x29, #-48] + add x20, x20, x21 + add x24, x24, #8 + subs x28, x28, #1 + stp q0, q1, [x26], #32 + b.ne .LBB2_10 +.LBB2_16: + add sp, sp, #384 + ldp x20, x19, [sp, #144] + ldp x22, x21, [sp, #128] + ldp x24, x23, [sp, #112] + ldp x26, x25, [sp, #96] + ldp x28, x27, [sp, #80] + ldp x29, x30, [sp, #64] + ldp d9, d8, [sp, #48] + ldp d11, d10, [sp, #32] + ldp d13, d12, [sp, #16] + ldp d15, d14, [sp], #160 + ret +.Lfunc_end2: + .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S new file mode 100644 index 000000000..eb6946400 --- /dev/null +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S @@ -0,0 +1,2463 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE4.1 -> ARMv8-A + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if defined(__aarch64__) + .text + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI0_1: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI0_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI0_3: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LCPI0_4: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 + .text + .globl zfs_blake3_compress_in_place_sse41 + .p2align 2 + .type zfs_blake3_compress_in_place_sse41,@function +zfs_blake3_compress_in_place_sse41: + .cfi_startproc + ldp q7, q6, [x0] + ldp q17, q18, [x1] + add x12, x1, #32 + ld2 { v4.4s, v5.4s }, [x12] + lsr x10, x3, #32 + fmov s16, w3 + adrp x13, .LCPI0_0 + adrp x11, .LCPI0_1 + and w8, w2, #0xff + mov v16.s[1], w10 + ldr q0, [x13, :lo12:.LCPI0_0] + ldr q20, [x11, :lo12:.LCPI0_1] + adrp x11, .LCPI0_4 + and w9, w4, #0xff + ldr q2, [x11, :lo12:.LCPI0_4] + mov v16.s[2], w8 + uzp1 v21.4s, v17.4s, v18.4s + add v7.4s, v6.4s, v7.4s + adrp x12, .LCPI0_3 + mov v16.s[3], w9 + uzp2 v18.4s, v17.4s, v18.4s + add v7.4s, v7.4s, v21.4s + ext v17.16b, v5.16b, v5.16b, #12 + ldr q3, [x12, :lo12:.LCPI0_3] + ext v24.16b, v4.16b, v4.16b, #12 + eor v16.16b, v7.16b, v16.16b + mov v27.16b, v17.16b + uzp1 v19.4s, v21.4s, v21.4s + ext v25.16b, v21.16b, v21.16b, #12 + zip2 v28.4s, v18.4s, v17.4s + tbl v29.16b, { v16.16b }, v0.16b + mov v27.s[1], v24.s[2] + zip1 v23.2d, v17.2d, v18.2d + ext v19.16b, v19.16b, v21.16b, #8 + add v22.4s, v29.4s, v20.4s + ext v26.16b, v21.16b, v25.16b, #12 + tbl v20.16b, { v23.16b, v24.16b }, v2.16b + zip1 v21.4s, v28.4s, v24.4s + zip1 v23.4s, v24.4s, v28.4s + uzp2 v19.4s, v19.4s, v18.4s + eor v24.16b, v22.16b, v6.16b + ext v25.16b, v20.16b, v20.16b, #12 + ext v6.16b, v23.16b, v21.16b, #8 + add v7.4s, v7.4s, v18.4s + ext v18.16b, v19.16b, v19.16b, #4 + tbl v16.16b, { v26.16b, v27.16b }, v3.16b + uzp1 v21.4s, v20.4s, v25.4s + mov v26.16b, v6.16b + ext v23.16b, v18.16b, v18.16b, #12 + mov v26.s[1], v21.s[2] + adrp x10, .LCPI0_2 + ext v25.16b, v18.16b, v23.16b, #12 + uzp1 v23.4s, v18.4s, v18.4s + ldr q1, [x10, :lo12:.LCPI0_2] + ext v18.16b, v23.16b, v18.16b, #8 + ushr v23.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + orr v23.16b, v24.16b, v23.16b + add v7.4s, v7.4s, v23.4s + eor v27.16b, v29.16b, v7.16b + add v4.4s, v7.4s, v4.4s + tbl v7.16b, { v25.16b, v26.16b }, v3.16b + tbl v26.16b, { v27.16b }, v1.16b + add v22.4s, v22.4s, v26.4s + uzp2 v18.4s, v18.4s, v16.4s + eor v23.16b, v23.16b, v22.16b + ext v5.16b, v18.16b, v18.16b, #4 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + uzp1 v25.4s, v5.4s, v5.4s + orr v23.16b, v23.16b, v27.16b + ext v28.16b, v4.16b, v4.16b, #12 + ext v4.16b, v25.16b, v5.16b, #8 + ext v25.16b, v26.16b, v26.16b, #8 + add v26.4s, v28.4s, v23.4s + eor v25.16b, v26.16b, v25.16b + ext v22.16b, v22.16b, v22.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v22.4s, v22.4s, v25.4s + eor v23.16b, v23.16b, v22.16b + add v17.4s, v26.4s, v17.4s + ushr v26.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v26.16b + add v17.4s, v17.4s, v23.4s + eor v25.16b, v25.16b, v17.16b + add v17.4s, v17.4s, v19.4s + tbl v19.16b, { v25.16b }, v1.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + ext v17.16b, v17.16b, v17.16b, #4 + orr v23.16b, v23.16b, v25.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v17.4s, v17.4s, v23.4s + eor v19.16b, v17.16b, v19.16b + ext v22.16b, v22.16b, v22.16b, #12 + tbl v19.16b, { v19.16b }, v0.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v17.4s, v17.4s, v16.4s + orr v23.16b, v23.16b, v25.16b + add v17.4s, v17.4s, v23.4s + ext v25.16b, v17.16b, v17.16b, #12 + eor v17.16b, v19.16b, v17.16b + tbl v17.16b, { v17.16b }, v1.16b + add v19.4s, v22.4s, v17.4s + eor v22.16b, v23.16b, v19.16b + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v6.2d, v16.2d + ushr v23.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + zip2 v24.4s, v16.4s, v6.4s + tbl v26.16b, { v20.16b, v21.16b }, v2.16b + orr v22.16b, v22.16b, v23.16b + zip1 v16.4s, v24.4s, v21.4s + zip1 v20.4s, v21.4s, v24.4s + ext v21.16b, v26.16b, v26.16b, #12 + ext v17.16b, v17.16b, v17.16b, #8 + add v25.4s, v25.4s, v22.4s + ext v16.16b, v20.16b, v16.16b, #8 + uzp1 v21.4s, v26.4s, v21.4s + eor v26.16b, v25.16b, v17.16b + ext v19.16b, v19.16b, v19.16b, #4 + tbl v26.16b, { v26.16b }, v0.16b + mov v29.16b, v16.16b + add v19.4s, v19.4s, v26.4s + ext v27.16b, v5.16b, v5.16b, #12 + mov v29.s[1], v21.s[2] + eor v22.16b, v22.16b, v19.16b + ext v28.16b, v5.16b, v27.16b, #12 + ushr v27.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v25.4s, v6.4s + orr v22.16b, v22.16b, v27.16b + add v6.4s, v6.4s, v22.4s + eor v26.16b, v26.16b, v6.16b + add v6.4s, v6.4s, v18.4s + tbl v18.16b, { v26.16b }, v1.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v22.16b, v22.16b, v26.16b + ext v18.16b, v18.16b, v18.16b, #8 + add v6.4s, v6.4s, v22.4s + eor v18.16b, v6.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v18.16b, { v18.16b }, v0.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v6.4s, v7.4s + orr v22.16b, v22.16b, v26.16b + add v6.4s, v6.4s, v22.4s + ext v26.16b, v6.16b, v6.16b, #12 + eor v6.16b, v18.16b, v6.16b + uzp2 v4.4s, v4.4s, v7.4s + zip2 v25.4s, v7.4s, v16.4s + add v26.4s, v26.4s, v21.4s + zip1 v20.2d, v16.2d, v7.2d + tbl v6.16b, { v6.16b }, v1.16b + ext v24.16b, v4.16b, v4.16b, #4 + tbl v27.16b, { v20.16b, v21.16b }, v2.16b + zip1 v7.4s, v25.4s, v21.4s + zip1 v20.4s, v21.4s, v25.4s + add v18.4s, v19.4s, v6.4s + uzp1 v5.4s, v24.4s, v24.4s + ext v21.16b, v27.16b, v27.16b, #12 + ext v7.16b, v20.16b, v7.16b, #8 + eor v19.16b, v22.16b, v18.16b + ext v5.16b, v5.16b, v24.16b, #8 + tbl v17.16b, { v28.16b, v29.16b }, v3.16b + uzp1 v21.4s, v27.4s, v21.4s + mov v28.16b, v7.16b + ushr v22.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v23.16b, v24.16b, v24.16b, #12 + uzp2 v5.4s, v5.4s, v17.4s + mov v28.s[1], v21.s[2] + orr v19.16b, v19.16b, v22.16b + ext v27.16b, v24.16b, v23.16b, #12 + ext v23.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #8 + ext v25.16b, v18.16b, v18.16b, #4 + add v18.4s, v26.4s, v19.4s + uzp1 v24.4s, v23.4s, v23.4s + eor v6.16b, v18.16b, v6.16b + ext v24.16b, v24.16b, v23.16b, #8 + add v16.4s, v18.4s, v16.4s + tbl v18.16b, { v27.16b, v28.16b }, v3.16b + tbl v27.16b, { v6.16b }, v0.16b + uzp2 v6.4s, v24.4s, v18.4s + add v24.4s, v25.4s, v27.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v25.16b + add v16.4s, v16.4s, v19.4s + eor v25.16b, v27.16b, v16.16b + add v4.4s, v16.4s, v4.4s + tbl v16.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v16.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v16.16b, v16.16b, v16.16b, #8 + add v4.4s, v4.4s, v19.4s + eor v16.16b, v4.16b, v16.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v25.16b, { v16.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v16.16b, v19.16b, v24.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v17.4s + orr v19.16b, v16.16b, v19.16b + add v27.4s, v4.4s, v19.4s + eor v25.16b, v25.16b, v27.16b + tbl v25.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v25.4s + zip2 v26.4s, v17.4s, v7.4s + ext v4.16b, v27.16b, v27.16b, #12 + eor v19.16b, v19.16b, v24.16b + add v28.4s, v4.4s, v21.4s + zip1 v20.2d, v7.2d, v17.2d + zip1 v4.4s, v26.4s, v21.4s + zip1 v17.4s, v21.4s, v26.4s + ushr v26.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v26.16b + ext v25.16b, v25.16b, v25.16b, #8 + add v27.4s, v28.4s, v19.4s + eor v25.16b, v27.16b, v25.16b + ext v24.16b, v24.16b, v24.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v19.16b, v19.16b, v24.16b + add v7.4s, v27.4s, v7.4s + ushr v27.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v27.16b + add v7.4s, v7.4s, v19.4s + eor v25.16b, v25.16b, v7.16b + add v5.4s, v7.4s, v5.4s + tbl v7.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v7.16b, v7.16b, v7.16b, #8 + add v5.4s, v5.4s, v19.4s + eor v7.16b, v5.16b, v7.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v7.16b, { v7.16b }, v0.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + tbl v16.16b, { v20.16b, v21.16b }, v2.16b + add v5.4s, v5.4s, v18.4s + orr v19.16b, v19.16b, v25.16b + ext v20.16b, v16.16b, v16.16b, #12 + ext v4.16b, v17.16b, v4.16b, #8 + add v5.4s, v5.4s, v19.4s + uzp1 v21.4s, v16.4s, v20.4s + mov v17.16b, v4.16b + ext v25.16b, v5.16b, v5.16b, #12 + mov v17.s[1], v21.s[2] + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v4.2d, v18.2d + ext v22.16b, v23.16b, v23.16b, #12 + zip2 v26.4s, v18.4s, v4.4s + tbl v18.16b, { v20.16b, v21.16b }, v2.16b + eor v5.16b, v7.16b, v5.16b + ext v16.16b, v23.16b, v22.16b, #12 + ext v22.16b, v6.16b, v6.16b, #4 + zip1 v27.4s, v26.4s, v21.4s + zip1 v20.4s, v21.4s, v26.4s + ext v21.16b, v18.16b, v18.16b, #12 + tbl v5.16b, { v5.16b }, v1.16b + ext v20.16b, v20.16b, v27.16b, #8 + uzp1 v27.4s, v18.4s, v21.4s + uzp1 v18.4s, v22.4s, v22.4s + add v21.4s, v24.4s, v5.4s + ext v18.16b, v18.16b, v22.16b, #8 + eor v19.16b, v19.16b, v21.16b + tbl v7.16b, { v16.16b, v17.16b }, v3.16b + uzp2 v18.4s, v18.4s, v17.4s + zip2 v16.4s, v16.4s, v20.4s + ushr v17.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v17.16b, v19.16b, v17.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v19.4s, v25.4s, v17.4s + eor v5.16b, v19.16b, v5.16b + ext v21.16b, v21.16b, v21.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v4.4s, v19.4s, v4.4s + add v19.4s, v21.4s, v5.4s + eor v17.16b, v17.16b, v19.16b + ushr v21.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v17.16b, v17.16b, v21.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v4.4s, v4.4s, v6.4s + add v6.4s, v19.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v17.16b, v19.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v17.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v7.4s + orr v17.16b, v17.16b, v19.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + mov v29.16b, v20.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v6.4s, v6.4s, v5.4s + mov v29.s[1], v27.s[2] + add v4.4s, v4.4s, v27.4s + zip1 v26.2d, v20.2d, v7.2d + zip1 v7.4s, v16.4s, v27.4s + zip1 v16.4s, v27.4s, v16.4s + eor v17.16b, v17.16b, v6.16b + ext v7.16b, v16.16b, v7.16b, #8 + ushr v16.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v16.16b, v17.16b, v16.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v16.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + ushr v17.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v20.4s + orr v16.16b, v16.16b, v17.16b + add v4.4s, v4.4s, v16.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + add v4.4s, v4.4s, v18.4s + ushr v17.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + ext v23.16b, v22.16b, v22.16b, #12 + ext v4.16b, v4.16b, v4.16b, #4 + orr v16.16b, v16.16b, v17.16b + ext v28.16b, v22.16b, v23.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v16.4s, v4.4s + tbl v3.16b, { v28.16b, v29.16b }, v3.16b + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v4.4s, v3.4s + tbl v4.16b, { v5.16b }, v0.16b + add v5.4s, v6.4s, v4.4s + eor v6.16b, v16.16b, v5.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v16.16b + tbl v2.16b, { v26.16b, v27.16b }, v2.16b + add v3.4s, v3.4s, v6.4s + ext v19.16b, v2.16b, v2.16b, #12 + eor v4.16b, v4.16b, v3.16b + uzp1 v2.4s, v2.4s, v19.4s + ext v3.16b, v3.16b, v3.16b, #12 + tbl v4.16b, { v4.16b }, v1.16b + add v2.4s, v3.4s, v2.4s + add v3.4s, v5.4s, v4.4s + eor v5.16b, v6.16b, v3.16b + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v6.16b + ext v4.16b, v4.16b, v4.16b, #8 + add v2.4s, v2.4s, v5.4s + eor v4.16b, v2.16b, v4.16b + ext v3.16b, v3.16b, v3.16b, #4 + tbl v0.16b, { v4.16b }, v0.16b + add v3.4s, v3.4s, v0.4s + eor v4.16b, v5.16b, v3.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v2.4s, v2.4s, v7.4s + orr v4.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v4.4s + eor v0.16b, v0.16b, v2.16b + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v3.4s, v0.4s + eor v3.16b, v4.16b, v1.16b + ext v2.16b, v2.16b, v2.16b, #4 + ext v1.16b, v1.16b, v1.16b, #12 + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v0.16b, v0.16b, v0.16b, #8 + eor v1.16b, v2.16b, v1.16b + orr v2.16b, v3.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + stp q1, q0, [x0] + ret +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI1_1: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI1_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI1_3: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LCPI1_4: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 + .text + .globl zfs_blake3_compress_xof_sse41 + .p2align 2 + .type zfs_blake3_compress_xof_sse41,@function +zfs_blake3_compress_xof_sse41: + .cfi_startproc + ldp q7, q6, [x0] + ldp q17, q18, [x1] + add x12, x1, #32 + ld2 { v4.4s, v5.4s }, [x12] + lsr x10, x3, #32 + fmov s16, w3 + adrp x13, .LCPI1_0 + adrp x11, .LCPI1_1 + and w8, w2, #0xff + mov v16.s[1], w10 + ldr q0, [x13, :lo12:.LCPI1_0] + ldr q20, [x11, :lo12:.LCPI1_1] + adrp x11, .LCPI1_4 + and w9, w4, #0xff + ldr q2, [x11, :lo12:.LCPI1_4] + mov v16.s[2], w8 + uzp1 v21.4s, v17.4s, v18.4s + add v7.4s, v6.4s, v7.4s + adrp x12, .LCPI1_3 + mov v16.s[3], w9 + uzp2 v18.4s, v17.4s, v18.4s + add v7.4s, v7.4s, v21.4s + ext v17.16b, v5.16b, v5.16b, #12 + ldr q3, [x12, :lo12:.LCPI1_3] + ext v24.16b, v4.16b, v4.16b, #12 + eor v16.16b, v7.16b, v16.16b + mov v27.16b, v17.16b + uzp1 v19.4s, v21.4s, v21.4s + ext v25.16b, v21.16b, v21.16b, #12 + zip2 v28.4s, v18.4s, v17.4s + tbl v29.16b, { v16.16b }, v0.16b + mov v27.s[1], v24.s[2] + zip1 v23.2d, v17.2d, v18.2d + ext v19.16b, v19.16b, v21.16b, #8 + add v22.4s, v29.4s, v20.4s + ext v26.16b, v21.16b, v25.16b, #12 + tbl v20.16b, { v23.16b, v24.16b }, v2.16b + zip1 v21.4s, v28.4s, v24.4s + zip1 v23.4s, v24.4s, v28.4s + uzp2 v19.4s, v19.4s, v18.4s + eor v24.16b, v22.16b, v6.16b + ext v25.16b, v20.16b, v20.16b, #12 + ext v6.16b, v23.16b, v21.16b, #8 + add v7.4s, v7.4s, v18.4s + ext v18.16b, v19.16b, v19.16b, #4 + tbl v16.16b, { v26.16b, v27.16b }, v3.16b + uzp1 v21.4s, v20.4s, v25.4s + mov v26.16b, v6.16b + ext v23.16b, v18.16b, v18.16b, #12 + mov v26.s[1], v21.s[2] + adrp x10, .LCPI1_2 + ext v25.16b, v18.16b, v23.16b, #12 + uzp1 v23.4s, v18.4s, v18.4s + ldr q1, [x10, :lo12:.LCPI1_2] + ext v18.16b, v23.16b, v18.16b, #8 + ushr v23.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + orr v23.16b, v24.16b, v23.16b + add v7.4s, v7.4s, v23.4s + eor v27.16b, v29.16b, v7.16b + add v4.4s, v7.4s, v4.4s + tbl v7.16b, { v25.16b, v26.16b }, v3.16b + tbl v26.16b, { v27.16b }, v1.16b + add v22.4s, v22.4s, v26.4s + uzp2 v18.4s, v18.4s, v16.4s + eor v23.16b, v23.16b, v22.16b + ext v5.16b, v18.16b, v18.16b, #4 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + uzp1 v25.4s, v5.4s, v5.4s + orr v23.16b, v23.16b, v27.16b + ext v28.16b, v4.16b, v4.16b, #12 + ext v4.16b, v25.16b, v5.16b, #8 + ext v25.16b, v26.16b, v26.16b, #8 + add v26.4s, v28.4s, v23.4s + eor v25.16b, v26.16b, v25.16b + ext v22.16b, v22.16b, v22.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v22.4s, v22.4s, v25.4s + eor v23.16b, v23.16b, v22.16b + add v17.4s, v26.4s, v17.4s + ushr v26.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v26.16b + add v17.4s, v17.4s, v23.4s + eor v25.16b, v25.16b, v17.16b + add v17.4s, v17.4s, v19.4s + tbl v19.16b, { v25.16b }, v1.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + ext v17.16b, v17.16b, v17.16b, #4 + orr v23.16b, v23.16b, v25.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v17.4s, v17.4s, v23.4s + eor v19.16b, v17.16b, v19.16b + ext v22.16b, v22.16b, v22.16b, #12 + tbl v19.16b, { v19.16b }, v0.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v17.4s, v17.4s, v16.4s + orr v23.16b, v23.16b, v25.16b + add v17.4s, v17.4s, v23.4s + ext v25.16b, v17.16b, v17.16b, #12 + eor v17.16b, v19.16b, v17.16b + tbl v17.16b, { v17.16b }, v1.16b + add v19.4s, v22.4s, v17.4s + eor v22.16b, v23.16b, v19.16b + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v6.2d, v16.2d + ushr v23.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + zip2 v24.4s, v16.4s, v6.4s + tbl v26.16b, { v20.16b, v21.16b }, v2.16b + orr v22.16b, v22.16b, v23.16b + zip1 v16.4s, v24.4s, v21.4s + zip1 v20.4s, v21.4s, v24.4s + ext v21.16b, v26.16b, v26.16b, #12 + ext v17.16b, v17.16b, v17.16b, #8 + add v25.4s, v25.4s, v22.4s + ext v16.16b, v20.16b, v16.16b, #8 + uzp1 v21.4s, v26.4s, v21.4s + eor v26.16b, v25.16b, v17.16b + ext v19.16b, v19.16b, v19.16b, #4 + tbl v26.16b, { v26.16b }, v0.16b + mov v29.16b, v16.16b + add v19.4s, v19.4s, v26.4s + ext v27.16b, v5.16b, v5.16b, #12 + mov v29.s[1], v21.s[2] + eor v22.16b, v22.16b, v19.16b + ext v28.16b, v5.16b, v27.16b, #12 + ushr v27.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v25.4s, v6.4s + orr v22.16b, v22.16b, v27.16b + add v6.4s, v6.4s, v22.4s + eor v26.16b, v26.16b, v6.16b + add v6.4s, v6.4s, v18.4s + tbl v18.16b, { v26.16b }, v1.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v22.16b, v22.16b, v26.16b + ext v18.16b, v18.16b, v18.16b, #8 + add v6.4s, v6.4s, v22.4s + eor v18.16b, v6.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v18.16b, { v18.16b }, v0.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v6.4s, v7.4s + orr v22.16b, v22.16b, v26.16b + add v6.4s, v6.4s, v22.4s + ext v26.16b, v6.16b, v6.16b, #12 + eor v6.16b, v18.16b, v6.16b + uzp2 v4.4s, v4.4s, v7.4s + zip2 v25.4s, v7.4s, v16.4s + add v26.4s, v26.4s, v21.4s + zip1 v20.2d, v16.2d, v7.2d + tbl v6.16b, { v6.16b }, v1.16b + ext v24.16b, v4.16b, v4.16b, #4 + tbl v27.16b, { v20.16b, v21.16b }, v2.16b + zip1 v7.4s, v25.4s, v21.4s + zip1 v20.4s, v21.4s, v25.4s + add v18.4s, v19.4s, v6.4s + uzp1 v5.4s, v24.4s, v24.4s + ext v21.16b, v27.16b, v27.16b, #12 + ext v7.16b, v20.16b, v7.16b, #8 + eor v19.16b, v22.16b, v18.16b + ext v5.16b, v5.16b, v24.16b, #8 + tbl v17.16b, { v28.16b, v29.16b }, v3.16b + uzp1 v21.4s, v27.4s, v21.4s + mov v28.16b, v7.16b + ushr v22.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v23.16b, v24.16b, v24.16b, #12 + uzp2 v5.4s, v5.4s, v17.4s + mov v28.s[1], v21.s[2] + orr v19.16b, v19.16b, v22.16b + ext v27.16b, v24.16b, v23.16b, #12 + ext v23.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #8 + ext v25.16b, v18.16b, v18.16b, #4 + add v18.4s, v26.4s, v19.4s + uzp1 v24.4s, v23.4s, v23.4s + eor v6.16b, v18.16b, v6.16b + ext v24.16b, v24.16b, v23.16b, #8 + add v16.4s, v18.4s, v16.4s + tbl v18.16b, { v27.16b, v28.16b }, v3.16b + tbl v27.16b, { v6.16b }, v0.16b + uzp2 v6.4s, v24.4s, v18.4s + add v24.4s, v25.4s, v27.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v25.16b + add v16.4s, v16.4s, v19.4s + eor v25.16b, v27.16b, v16.16b + add v4.4s, v16.4s, v4.4s + tbl v16.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v16.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v16.16b, v16.16b, v16.16b, #8 + add v4.4s, v4.4s, v19.4s + eor v16.16b, v4.16b, v16.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v25.16b, { v16.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v16.16b, v19.16b, v24.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v17.4s + orr v19.16b, v16.16b, v19.16b + add v27.4s, v4.4s, v19.4s + eor v25.16b, v25.16b, v27.16b + tbl v25.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v25.4s + zip2 v26.4s, v17.4s, v7.4s + ext v4.16b, v27.16b, v27.16b, #12 + eor v19.16b, v19.16b, v24.16b + add v28.4s, v4.4s, v21.4s + zip1 v20.2d, v7.2d, v17.2d + zip1 v4.4s, v26.4s, v21.4s + zip1 v17.4s, v21.4s, v26.4s + ushr v26.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v26.16b + ext v25.16b, v25.16b, v25.16b, #8 + add v27.4s, v28.4s, v19.4s + eor v25.16b, v27.16b, v25.16b + ext v24.16b, v24.16b, v24.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v19.16b, v19.16b, v24.16b + add v7.4s, v27.4s, v7.4s + ushr v27.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v27.16b + add v7.4s, v7.4s, v19.4s + eor v25.16b, v25.16b, v7.16b + add v5.4s, v7.4s, v5.4s + tbl v7.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v7.16b, v7.16b, v7.16b, #8 + add v5.4s, v5.4s, v19.4s + eor v7.16b, v5.16b, v7.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v7.16b, { v7.16b }, v0.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + tbl v16.16b, { v20.16b, v21.16b }, v2.16b + add v5.4s, v5.4s, v18.4s + orr v19.16b, v19.16b, v25.16b + ext v20.16b, v16.16b, v16.16b, #12 + ext v4.16b, v17.16b, v4.16b, #8 + add v5.4s, v5.4s, v19.4s + uzp1 v21.4s, v16.4s, v20.4s + mov v17.16b, v4.16b + ext v25.16b, v5.16b, v5.16b, #12 + mov v17.s[1], v21.s[2] + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v4.2d, v18.2d + ext v22.16b, v23.16b, v23.16b, #12 + zip2 v26.4s, v18.4s, v4.4s + tbl v18.16b, { v20.16b, v21.16b }, v2.16b + eor v5.16b, v7.16b, v5.16b + ext v16.16b, v23.16b, v22.16b, #12 + ext v22.16b, v6.16b, v6.16b, #4 + zip1 v27.4s, v26.4s, v21.4s + zip1 v20.4s, v21.4s, v26.4s + ext v21.16b, v18.16b, v18.16b, #12 + tbl v5.16b, { v5.16b }, v1.16b + ext v20.16b, v20.16b, v27.16b, #8 + uzp1 v27.4s, v18.4s, v21.4s + uzp1 v18.4s, v22.4s, v22.4s + add v21.4s, v24.4s, v5.4s + ext v18.16b, v18.16b, v22.16b, #8 + eor v19.16b, v19.16b, v21.16b + tbl v7.16b, { v16.16b, v17.16b }, v3.16b + uzp2 v18.4s, v18.4s, v17.4s + zip2 v16.4s, v16.4s, v20.4s + ushr v17.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v17.16b, v19.16b, v17.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v19.4s, v25.4s, v17.4s + eor v5.16b, v19.16b, v5.16b + ext v21.16b, v21.16b, v21.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v4.4s, v19.4s, v4.4s + add v19.4s, v21.4s, v5.4s + eor v17.16b, v17.16b, v19.16b + ushr v21.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v17.16b, v17.16b, v21.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v4.4s, v4.4s, v6.4s + add v6.4s, v19.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v17.16b, v19.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v17.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v7.4s + orr v17.16b, v17.16b, v19.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + mov v29.16b, v20.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v6.4s, v6.4s, v5.4s + mov v29.s[1], v27.s[2] + add v4.4s, v4.4s, v27.4s + zip1 v26.2d, v20.2d, v7.2d + zip1 v7.4s, v16.4s, v27.4s + zip1 v16.4s, v27.4s, v16.4s + eor v17.16b, v17.16b, v6.16b + ext v7.16b, v16.16b, v7.16b, #8 + ushr v16.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v16.16b, v17.16b, v16.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v16.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + ushr v17.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v20.4s + orr v16.16b, v16.16b, v17.16b + add v4.4s, v4.4s, v16.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + add v4.4s, v4.4s, v18.4s + ushr v17.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + ext v23.16b, v22.16b, v22.16b, #12 + ext v4.16b, v4.16b, v4.16b, #4 + orr v16.16b, v16.16b, v17.16b + ext v28.16b, v22.16b, v23.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v16.4s, v4.4s + tbl v3.16b, { v28.16b, v29.16b }, v3.16b + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v4.4s, v3.4s + tbl v4.16b, { v5.16b }, v0.16b + add v5.4s, v6.4s, v4.4s + eor v6.16b, v16.16b, v5.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v16.16b + tbl v2.16b, { v26.16b, v27.16b }, v2.16b + add v3.4s, v3.4s, v6.4s + ext v19.16b, v2.16b, v2.16b, #12 + eor v4.16b, v4.16b, v3.16b + uzp1 v2.4s, v2.4s, v19.4s + ext v3.16b, v3.16b, v3.16b, #12 + tbl v4.16b, { v4.16b }, v1.16b + add v2.4s, v3.4s, v2.4s + add v3.4s, v5.4s, v4.4s + eor v5.16b, v6.16b, v3.16b + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v6.16b + ext v4.16b, v4.16b, v4.16b, #8 + add v2.4s, v2.4s, v5.4s + eor v4.16b, v2.16b, v4.16b + ext v3.16b, v3.16b, v3.16b, #4 + tbl v0.16b, { v4.16b }, v0.16b + add v3.4s, v3.4s, v0.4s + eor v4.16b, v5.16b, v3.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v2.4s, v2.4s, v7.4s + orr v4.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v4.4s + eor v0.16b, v0.16b, v2.16b + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v3.4s, v0.4s + eor v3.16b, v4.16b, v1.16b + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + ext v0.16b, v0.16b, v0.16b, #8 + ext v1.16b, v1.16b, v1.16b, #12 + orr v3.16b, v3.16b, v4.16b + eor v2.16b, v2.16b, v1.16b + eor v3.16b, v3.16b, v0.16b + stp q2, q3, [x5] + ldr q2, [x0] + eor v1.16b, v2.16b, v1.16b + str q1, [x5, #32] + ldr q1, [x0, #16] + eor v0.16b, v1.16b, v0.16b + str q0, [x5, #48] + ret +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_0: + .word 0 + .word 1 + .word 2 + .word 3 +.LCPI2_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI2_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 + .text + .globl zfs_blake3_hash_many_sse41 + .p2align 2 + .type zfs_blake3_hash_many_sse41,@function +zfs_blake3_hash_many_sse41: + .cfi_startproc + stp d15, d14, [sp, #-160]! + stp d13, d12, [sp, #16] + stp d11, d10, [sp, #32] + stp d9, d8, [sp, #48] + stp x29, x30, [sp, #64] + stp x28, x27, [sp, #80] + stp x26, x25, [sp, #96] + stp x24, x23, [sp, #112] + stp x22, x21, [sp, #128] + stp x20, x19, [sp, #144] + mov x29, sp + sub sp, sp, #448 + .cfi_def_cfa w29, 160 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + ldr x26, [x29, #168] + ldrb w27, [x29, #160] + mov w19, w6 + mov x20, x4 + mov x22, x2 + mov x28, x1 + cmp x1, #4 + mov x24, x0 + str x3, [sp, #40] + b.lo .LBB2_8 + adrp x11, .LCPI2_0 + ldr q0, [x11, :lo12:.LCPI2_0] + sbfx w13, w5, #0, #1 + dup v1.4s, w13 + mov w10, #58983 + mov w11, #44677 + mov w12, #62322 + and v0.16b, v1.16b, v0.16b + mov w13, #62778 + orr w8, w7, w19 + adrp x9, .LCPI2_1 + movk w10, #27145, lsl #16 + movk w11, #47975, lsl #16 + movk w12, #15470, lsl #16 + movk w13, #42319, lsl #16 + str q0, [sp, #16] + orr v0.4s, #128, lsl #24 + adrp x14, .LCPI2_2 + str q0, [sp] +.LBB2_2: + ldr x2, [sp, #40] + mov x15, x2 + ld1r { v7.4s }, [x15], #4 + add x16, x2, #8 + add x17, x2, #12 + add x18, x2, #16 + add x0, x2, #20 + add x3, x2, #24 + add x2, x2, #28 + ld1r { v6.4s }, [x16] + ld1r { v17.4s }, [x17] + ld1r { v10.4s }, [x18] + ld1r { v11.4s }, [x0] + ld1r { v19.4s }, [x3] + ld1r { v18.4s }, [x15] + ld1r { v16.4s }, [x2] + cbz x22, .LBB2_7 + ldr q1, [sp, #16] + dup v0.4s, w20 + ldp x15, x16, [x24] + ldp x17, x18, [x24, #16] + add v1.4s, v0.4s, v1.4s + movi v0.4s, #128, lsl #24 + str q1, [sp, #64] + eor v0.16b, v1.16b, v0.16b + ldr q1, [sp] + lsr x2, x20, #32 + mov x0, xzr + mov w6, w8 + cmgt v0.4s, v1.4s, v0.4s + dup v1.4s, w2 + sub v0.4s, v1.4s, v0.4s + str q0, [sp, #48] +.LBB2_4: + mov w4, #16 + stp q16, q17, [sp, #192] + bfi x4, x0, #6, #58 + ldr q1, [x15, x4] + ldr q3, [x16, x4] + ldr q2, [x17, x4] + ldr q4, [x18, x4] + mov w4, #32 + bfi x4, x0, #6, #58 + ldr q5, [x15, x4] + ldr q20, [x16, x4] + ldr q21, [x17, x4] + ldr q22, [x18, x4] + mov w4, #48 + lsl x3, x0, #6 + bfi x4, x0, #6, #58 + add x0, x0, #1 + ldr q0, [x15, x3] + ldr q23, [x16, x3] + ldr q16, [x17, x3] + ldr q17, [x18, x3] + cmp x0, x22 + ldr q25, [x15, x4] + ldr q14, [x16, x4] + ldr q28, [x17, x4] + ldr q31, [x18, x4] + csel w4, w27, wzr, eq + orr w4, w4, w6 + mov x2, xzr + and w6, w4, #0xff + add x3, x3, #256 +.LBB2_5: + ldr x4, [x24, x2] + add x2, x2, #8 + cmp x2, #32 + add x4, x4, x3 + prfm pldl1keep, [x4] + b.ne .LBB2_5 + zip1 v29.4s, v0.4s, v23.4s + zip2 v23.4s, v0.4s, v23.4s + zip1 v0.4s, v16.4s, v17.4s + zip2 v24.4s, v16.4s, v17.4s + zip1 v9.4s, v1.4s, v3.4s + zip2 v26.4s, v1.4s, v3.4s + zip1 v27.4s, v2.4s, v4.4s + zip2 v17.4s, v2.4s, v4.4s + zip1 v12.4s, v21.4s, v22.4s + zip2 v13.4s, v21.4s, v22.4s + add v2.4s, v7.4s, v10.4s + add v1.4s, v18.4s, v11.4s + ext v7.16b, v0.16b, v29.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + zip1 v30.4s, v5.4s, v20.4s + zip2 v20.4s, v5.4s, v20.4s + stp q1, q2, [sp, #112] + ext v2.16b, v29.16b, v7.16b, #8 + mov v29.d[1], v0.d[0] + ext v18.16b, v23.16b, v22.16b, #8 + mov v23.d[1], v24.d[0] + zip1 v21.4s, v25.4s, v14.4s + zip2 v4.4s, v25.4s, v14.4s + zip1 v14.4s, v28.4s, v31.4s + zip2 v15.4s, v28.4s, v31.4s + add v8.4s, v6.4s, v19.4s + ext v28.16b, v27.16b, v9.16b, #8 + ext v31.16b, v17.16b, v26.16b, #8 + stur q2, [x29, #-208] + mov v7.16b, v29.16b + ext v0.16b, v12.16b, v30.16b, #8 + stp q23, q29, [x29, #-80] + mov v2.16b, v19.16b + ext v19.16b, v13.16b, v20.16b, #8 + mov v29.16b, v9.16b + ext v25.16b, v9.16b, v28.16b, #8 + mov v29.d[1], v27.d[0] + ext v24.16b, v26.16b, v31.16b, #8 + mov v26.d[1], v17.d[0] + ext v17.16b, v15.16b, v4.16b, #8 + ext v27.16b, v30.16b, v0.16b, #8 + ext v0.16b, v20.16b, v19.16b, #8 + stp q0, q25, [sp, #80] + ext v0.16b, v4.16b, v17.16b, #8 + str q0, [sp, #224] + ldr q0, [sp, #128] + mov v6.16b, v23.16b + mov v22.16b, v4.16b + ldr q16, [x9, :lo12:.LCPI2_1] + add v17.4s, v0.4s, v7.4s + ldr q0, [sp, #112] + mov v30.d[1], v12.d[0] + add v7.4s, v8.4s, v29.4s + mov v20.d[1], v13.d[0] + add v4.4s, v0.4s, v6.4s + ldr q0, [sp, #64] + dup v3.4s, w12 + ext v28.16b, v14.16b, v21.16b, #8 + dup v1.4s, w10 + eor v19.16b, v17.16b, v0.16b + ldr q0, [sp, #48] + ext v23.16b, v21.16b, v28.16b, #8 + mov v21.d[1], v14.d[0] + tbl v14.16b, { v19.16b }, v16.16b + eor v12.16b, v4.16b, v0.16b + movi v0.4s, #64 + eor v13.16b, v7.16b, v0.16b + tbl v13.16b, { v13.16b }, v16.16b + add v6.4s, v13.4s, v3.4s + dup v5.4s, w11 + tbl v12.16b, { v12.16b }, v16.16b + add v1.4s, v14.4s, v1.4s + eor v9.16b, v6.16b, v2.16b + ldp q2, q0, [sp, #192] + add v5.4s, v12.4s, v5.4s + eor v19.16b, v1.16b, v10.16b + eor v10.16b, v5.16b, v11.16b + ushr v11.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v11.16b, v19.16b, v11.16b + ushr v19.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + mov v22.d[1], v15.d[0] + orr v10.16b, v10.16b, v19.16b + ushr v19.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + add v15.4s, v0.4s, v2.4s + orr v9.16b, v9.16b, v19.16b + dup v19.4s, w6 + add v15.4s, v15.4s, v26.4s + eor v19.16b, v15.16b, v19.16b + tbl v3.16b, { v19.16b }, v16.16b + dup v19.4s, w13 + add v8.4s, v3.4s, v19.4s + ldur q31, [x29, #-208] + eor v19.16b, v8.16b, v2.16b + ushr v0.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v2.16b, v19.16b, v0.16b + ldr q19, [x14, :lo12:.LCPI2_2] + add v17.4s, v17.4s, v31.4s + add v17.4s, v17.4s, v11.4s + eor v14.16b, v14.16b, v17.16b + tbl v14.16b, { v14.16b }, v19.16b + add v1.4s, v1.4s, v14.4s + eor v11.16b, v1.16b, v11.16b + add v4.4s, v4.4s, v18.4s + ushr v0.4s, v11.4s, #7 + shl v11.4s, v11.4s, #25 + add v4.4s, v4.4s, v10.4s + orr v0.16b, v11.16b, v0.16b + eor v11.16b, v12.16b, v4.16b + tbl v11.16b, { v11.16b }, v19.16b + add v5.4s, v5.4s, v11.4s + eor v10.16b, v5.16b, v10.16b + add v7.4s, v7.4s, v25.4s + ushr v12.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + add v7.4s, v7.4s, v9.4s + orr v10.16b, v10.16b, v12.16b + eor v12.16b, v13.16b, v7.16b + tbl v12.16b, { v12.16b }, v19.16b + add v6.4s, v6.4s, v12.4s + eor v9.16b, v6.16b, v9.16b + ushr v13.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + orr v9.16b, v9.16b, v13.16b + add v13.4s, v15.4s, v24.4s + add v13.4s, v13.4s, v2.4s + eor v3.16b, v3.16b, v13.16b + tbl v3.16b, { v3.16b }, v19.16b + add v8.4s, v8.4s, v3.4s + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v30.4s + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v21.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v22.4s + mov v28.16b, v26.16b + stur q26, [x29, #-112] + mov v26.16b, v18.16b + mov v18.16b, v24.16b + stur q24, [x29, #-160] + add v6.4s, v6.4s, v3.4s + mov v24.16b, v20.16b + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldr q20, [sp, #80] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + stp q30, q22, [x29, #-192] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + mov v30.16b, v27.16b + add v17.4s, v17.4s, v27.4s + ldr q27, [sp, #224] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v23.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v27.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + stur q21, [x29, #-144] + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + ldur q21, [x29, #-80] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v21.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v26.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v18.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v29.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-64] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v28.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v24.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v23.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-144] + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v31.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v27.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldr q27, [sp, #96] + mov v21.16b, v26.16b + stur q26, [x29, #-96] + mov v28.16b, v31.16b + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldp q31, q26, [x29, #-192] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v20.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v27.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v26.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v31.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + mov v18.16b, v24.16b + mov v24.16b, v20.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ldur q20, [x29, #-160] + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v21.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v18.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v23.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v20.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q25, [x29, #-80] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v29.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v25.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v26.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + ldur q25, [x29, #-112] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v30.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v24.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v31.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q25, [x29, #-64] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldr q31, [sp, #224] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v27.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v25.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v31.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v28.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v26.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v23.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + mov v21.16b, v29.16b + stur q29, [x29, #-128] + mov v29.16b, v30.16b + mov v30.16b, v27.16b + mov v27.16b, v18.16b + str q18, [sp, #176] + eor v0.16b, v0.16b, v1.16b + mov v18.16b, v22.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-96] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v20.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v29.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v31.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v21.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v24.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v28.4s + add v6.4s, v6.4s, v3.4s + mov v22.16b, v24.16b + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q24, [x29, #-80] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + mov v21.16b, v30.16b + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldur q30, [x29, #-192] + mov v20.16b, v29.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + ldur q29, [x29, #-112] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v24.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v29.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v20.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v31.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v26.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v23.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v27.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v30.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + ldur q27, [x29, #-160] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v27.4s + mov v28.16b, v25.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v21.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v28.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v29.4s + mov v25.16b, v31.16b + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q31, [x29, #-96] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldur q28, [x29, #-208] + mov v18.16b, v20.16b + str q20, [sp, #144] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + ldur q20, [x29, #-128] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v24.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v31.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v28.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v20.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v30.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v25.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + add v17.4s, v17.4s, v26.4s + mov v26.16b, v21.16b + add v4.4s, v4.4s, v21.4s + ldur q21, [x29, #-144] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v21.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v28.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + str q23, [sp, #160] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + add v17.4s, v17.4s, v23.4s + ldur q23, [x29, #-64] + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v23.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v24.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v20.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + add v13.4s, v13.4s, v0.4s + ldr q20, [sp, #176] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + eor v12.16b, v12.16b, v13.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + tbl v12.16b, { v12.16b }, v16.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v31.4s + orr v2.16b, v2.16b, v15.16b + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + add v7.4s, v7.4s, v29.4s + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v2.4s + orr v0.16b, v0.16b, v15.16b + mov v15.16b, v31.16b + add v17.4s, v17.4s, v22.4s + eor v31.16b, v14.16b, v4.16b + eor v22.16b, v11.16b, v7.16b + add v11.4s, v13.4s, v27.4s + tbl v3.16b, { v3.16b }, v19.16b + add v11.4s, v11.4s, v0.4s + tbl v31.16b, { v31.16b }, v19.16b + add v6.4s, v6.4s, v3.4s + eor v12.16b, v12.16b, v11.16b + tbl v22.16b, { v22.16b }, v19.16b + add v8.4s, v8.4s, v31.4s + eor v10.16b, v6.16b, v10.16b + add v30.4s, v11.4s, v30.4s + tbl v11.16b, { v12.16b }, v19.16b + add v1.4s, v1.4s, v22.4s + eor v9.16b, v8.16b, v9.16b + ushr v12.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + add v5.4s, v5.4s, v11.4s + eor v2.16b, v1.16b, v2.16b + orr v10.16b, v10.16b, v12.16b + ushr v12.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v9.16b, v9.16b, v12.16b + ushr v12.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v2.16b, v2.16b, v12.16b + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v12.16b + add v4.4s, v4.4s, v26.4s + add v17.4s, v17.4s, v0.4s + add v7.4s, v7.4s, v28.4s + mov v18.16b, v27.16b + eor v31.16b, v31.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v27.4s, v30.4s, v2.4s + eor v22.16b, v22.16b, v4.16b + add v7.4s, v7.4s, v9.4s + eor v3.16b, v3.16b, v27.16b + add v26.4s, v27.4s, v29.4s + tbl v27.16b, { v31.16b }, v16.16b + eor v28.16b, v11.16b, v7.16b + tbl v22.16b, { v22.16b }, v16.16b + add v1.4s, v1.4s, v27.4s + add v4.4s, v4.4s, v23.4s + ldr q23, [sp, #144] + tbl v28.16b, { v28.16b }, v16.16b + tbl v3.16b, { v3.16b }, v16.16b + add v5.4s, v5.4s, v22.4s + eor v0.16b, v0.16b, v1.16b + add v6.4s, v6.4s, v28.4s + add v29.4s, v8.4s, v3.4s + eor v30.16b, v5.16b, v10.16b + ushr v8.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v31.16b, v6.16b, v9.16b + orr v0.16b, v0.16b, v8.16b + ushr v8.4s, v30.4s, #12 + shl v30.4s, v30.4s, #20 + eor v2.16b, v29.16b, v2.16b + orr v30.16b, v30.16b, v8.16b + ushr v8.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + add v17.4s, v17.4s, v25.4s + add v7.4s, v7.4s, v23.4s + orr v31.16b, v31.16b, v8.16b + ushr v8.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ldur q23, [x29, #-176] + orr v2.16b, v2.16b, v8.16b + add v17.4s, v17.4s, v0.4s + eor v27.16b, v27.16b, v17.16b + add v4.4s, v4.4s, v30.4s + add v25.4s, v26.4s, v2.4s + eor v22.16b, v22.16b, v4.16b + add v4.4s, v4.4s, v24.4s + add v7.4s, v7.4s, v31.4s + eor v3.16b, v3.16b, v25.16b + add v24.4s, v25.4s, v18.4s + tbl v25.16b, { v27.16b }, v19.16b + add v17.4s, v17.4s, v23.4s + eor v23.16b, v28.16b, v7.16b + tbl v22.16b, { v22.16b }, v19.16b + add v1.4s, v1.4s, v25.4s + tbl v23.16b, { v23.16b }, v19.16b + tbl v3.16b, { v3.16b }, v19.16b + add v5.4s, v5.4s, v22.4s + eor v0.16b, v0.16b, v1.16b + add v6.4s, v6.4s, v23.4s + add v26.4s, v29.4s, v3.4s + eor v27.16b, v5.16b, v30.16b + ushr v29.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v28.16b, v6.16b, v31.16b + orr v0.16b, v0.16b, v29.16b + ushr v29.4s, v27.4s, #7 + shl v27.4s, v27.4s, #25 + eor v2.16b, v26.16b, v2.16b + orr v27.16b, v27.16b, v29.16b + ushr v29.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + ldur q18, [x29, #-128] + orr v28.16b, v28.16b, v29.16b + ushr v29.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v7.4s, v7.4s, v15.4s + orr v2.16b, v2.16b, v29.16b + add v17.4s, v17.4s, v27.4s + add v4.4s, v4.4s, v28.4s + add v7.4s, v7.4s, v2.4s + eor v3.16b, v3.16b, v17.16b + add v17.4s, v17.4s, v20.4s + eor v20.16b, v25.16b, v4.16b + add v4.4s, v4.4s, v21.4s + eor v21.16b, v22.16b, v7.16b + add v7.4s, v7.4s, v18.4s + add v18.4s, v24.4s, v0.4s + eor v22.16b, v23.16b, v18.16b + ldr q23, [sp, #160] + tbl v3.16b, { v3.16b }, v16.16b + tbl v20.16b, { v20.16b }, v16.16b + add v6.4s, v6.4s, v3.4s + add v18.4s, v18.4s, v23.4s + tbl v21.16b, { v21.16b }, v16.16b + tbl v16.16b, { v22.16b }, v16.16b + add v22.4s, v26.4s, v20.4s + eor v23.16b, v6.16b, v27.16b + add v1.4s, v1.4s, v21.4s + eor v24.16b, v22.16b, v28.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v5.4s, v5.4s, v16.4s + eor v2.16b, v1.16b, v2.16b + orr v23.16b, v23.16b, v25.16b + ushr v25.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + eor v0.16b, v5.16b, v0.16b + orr v24.16b, v24.16b, v25.16b + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + ushr v25.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + orr v0.16b, v0.16b, v25.16b + add v25.4s, v7.4s, v2.4s + add v26.4s, v18.4s, v0.4s + eor v18.16b, v21.16b, v25.16b + add v17.4s, v17.4s, v23.4s + add v4.4s, v4.4s, v24.4s + eor v16.16b, v16.16b, v26.16b + tbl v21.16b, { v18.16b }, v19.16b + eor v3.16b, v3.16b, v17.16b + eor v7.16b, v20.16b, v4.16b + tbl v16.16b, { v16.16b }, v19.16b + add v1.4s, v1.4s, v21.4s + tbl v3.16b, { v3.16b }, v19.16b + tbl v20.16b, { v7.16b }, v19.16b + eor v2.16b, v1.16b, v2.16b + eor v7.16b, v1.16b, v17.16b + add v1.4s, v5.4s, v16.4s + eor v0.16b, v1.16b, v0.16b + eor v18.16b, v1.16b, v4.16b + add v1.4s, v6.4s, v3.4s + eor v4.16b, v1.16b, v23.16b + eor v6.16b, v25.16b, v1.16b + add v1.4s, v22.4s, v20.4s + eor v5.16b, v1.16b, v24.16b + eor v17.16b, v26.16b, v1.16b + ushr v1.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + orr v1.16b, v4.16b, v1.16b + ushr v4.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v4.16b, v5.16b, v4.16b + ushr v5.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v2.16b, v2.16b, v5.16b + ushr v5.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v5.16b + eor v10.16b, v0.16b, v20.16b + eor v11.16b, v1.16b, v21.16b + eor v19.16b, v4.16b, v16.16b + cmp x0, x22 + eor v16.16b, v2.16b, v3.16b + mov w6, w19 + b.ne .LBB2_4 +.LBB2_7: + zip1 v0.4s, v7.4s, v18.4s + zip2 v1.4s, v7.4s, v18.4s + zip1 v2.4s, v6.4s, v17.4s + zip2 v3.4s, v6.4s, v17.4s + zip1 v4.4s, v10.4s, v11.4s + zip2 v5.4s, v10.4s, v11.4s + zip1 v6.4s, v19.4s, v16.4s + zip2 v7.4s, v19.4s, v16.4s + add x15, x20, #4 + tst w5, #0x1 + sub x28, x28, #4 + zip1 v16.2d, v0.2d, v2.2d + zip2 v0.2d, v0.2d, v2.2d + zip1 v2.2d, v1.2d, v3.2d + zip2 v1.2d, v1.2d, v3.2d + zip1 v3.2d, v4.2d, v6.2d + zip2 v4.2d, v4.2d, v6.2d + zip1 v6.2d, v5.2d, v7.2d + zip2 v5.2d, v5.2d, v7.2d + add x24, x24, #32 + csel x20, x15, x20, ne + cmp x28, #3 + stp q16, q3, [x26] + stp q0, q4, [x26, #32] + stp q2, q6, [x26, #64] + stp q1, q5, [x26, #96] + add x26, x26, #128 + b.hi .LBB2_2 +.LBB2_8: + cbz x28, .LBB2_16 + orr w8, w7, w19 + and x21, x5, #0x1 + stur w8, [x29, #-64] +.LBB2_10: + ldr x8, [sp, #40] + ldr x25, [x24] + ldur w4, [x29, #-64] + ldp q1, q0, [x8] + mov x8, x22 + stp q1, q0, [x29, #-48] +.LBB2_11: + subs x23, x8, #1 + b.eq .LBB2_13 + cbnz x8, .LBB2_14 + b .LBB2_15 +.LBB2_13: + orr w4, w4, w27 +.LBB2_14: + sub x0, x29, #48 + mov w2, #64 + mov x1, x25 + mov x3, x20 + bl zfs_blake3_compress_in_place_sse41 + add x25, x25, #64 + mov x8, x23 + mov w4, w19 + b .LBB2_11 +.LBB2_15: + ldp q0, q1, [x29, #-48] + add x20, x20, x21 + add x24, x24, #8 + subs x28, x28, #1 + stp q0, q1, [x26], #32 + b.ne .LBB2_10 +.LBB2_16: + add sp, sp, #448 + ldp x20, x19, [sp, #144] + ldp x22, x21, [sp, #128] + ldp x24, x23, [sp, #112] + ldp x26, x25, [sp, #96] + ldp x28, x27, [sp, #80] + ldp x29, x30, [sp, #64] + ldp d9, d8, [sp, #48] + ldp d11, d10, [sp, #32] + ldp d13, d12, [sp, #16] + ldp d15, d14, [sp], #160 + ret +.Lfunc_end2: + .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S new file mode 100644 index 000000000..9deba202f --- /dev/null +++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S @@ -0,0 +1,2823 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE2 -> POWER8 PPC64 Little Endian + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + .text + .abiversion 2 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI0_1: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI0_2: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_3: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_4: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_5: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_6: + .short 1 + .short 2 + .short 4 + .short 8 + .short 16 + .short 32 + .short 64 + .short 128 +.LCPI0_7: + .short 0 + .short 0 + .short 4 + .short 8 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI0_8: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_9: + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI0_10: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_12: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_13: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_14: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .text + .globl zfs_blake3_compress_in_place_sse2 + .p2align 2 + .type zfs_blake3_compress_in_place_sse2,@function +zfs_blake3_compress_in_place_sse2: +.Lfunc_begin0: + .cfi_startproc +.Lfunc_gep0: + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 2, 2, .TOC.-.Lfunc_gep0@l +.Lfunc_lep0: + .localentry zfs_blake3_compress_in_place_sse2, .Lfunc_lep0-.Lfunc_gep0 + li 8, -64 + mtvsrd 35, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 12, 9 + stxvd2x 60, 1, 8 + li 8, -48 + mtvsrd 36, 7 + lfd 2, 16(4) + stxvd2x 61, 1, 8 + li 8, -32 + lfd 1, 8(4) + mtvsrwz 37, 6 + rldicl 6, 6, 32, 32 + addis 7, 2, .LCPI0_2@toc@ha + stxvd2x 62, 1, 8 + li 8, -16 + addi 7, 7, .LCPI0_2@toc@l + stxvd2x 63, 1, 8 + li 8, 0 + lvx 9, 0, 7 + li 7, 48 + mtvsrd 34, 8 + xxmrghd 32, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + lfd 3, 24(4) + addis 8, 2, .LCPI0_5@toc@ha + vmrghb 3, 2, 3 + addi 8, 8, .LCPI0_5@toc@l + vmrghb 4, 2, 4 + vspltb 2, 2, 7 + xxmrghd 33, 3, 2 + vpkudum 7, 1, 0 + vmrglh 3, 2, 3 + vmrglh 2, 2, 4 + mtvsrwz 36, 6 + addis 6, 2, .LCPI0_0@toc@ha + addi 6, 6, .LCPI0_0@toc@l + vperm 10, 1, 0, 9 + vmrghw 4, 4, 5 + xxswapd 37, 1 + lxvd2x 1, 4, 7 + addis 7, 2, .LCPI0_8@toc@ha + addi 7, 7, .LCPI0_8@toc@l + vmrglw 2, 2, 3 + xxswapd 35, 0 + xxswapd 41, 1 + xxspltd 62, 42, 1 + vadduwm 3, 7, 3 + vadduwm 6, 3, 5 + xxmrgld 36, 34, 36 + lvx 2, 0, 6 + addis 6, 2, .LCPI0_1@toc@ha + addi 6, 6, .LCPI0_1@toc@l + xxlxor 35, 38, 36 + lvx 4, 0, 6 + li 6, 32 + lxvd2x 0, 4, 6 + addis 4, 2, .LCPI0_3@toc@ha + addis 6, 2, .LCPI0_7@toc@ha + vperm 8, 3, 3, 2 + vspltisw 3, 10 + addi 4, 4, .LCPI0_3@toc@l + addi 6, 6, .LCPI0_7@toc@l + vadduwm 3, 3, 3 + vadduwm 11, 8, 4 + xxlxor 36, 43, 37 + vadduwm 5, 6, 10 + vrlw 0, 4, 3 + vspltisw 4, 12 + vadduwm 4, 4, 4 + vadduwm 1, 0, 5 + xxlxor 37, 33, 40 + xxswapd 40, 0 + vrlw 6, 5, 4 + vspltisw 5, -16 + vpkudum 13, 9, 8 + vsubuwm 5, 12, 5 + lvx 12, 0, 4 + addis 4, 2, .LCPI0_4@toc@ha + addi 4, 4, .LCPI0_4@toc@l + vadduwm 11, 6, 11 + xxswapd 0, 38 + vadduwm 1, 1, 13 + xxsldwi 50, 45, 45, 1 + xxlxor 32, 43, 32 + xxsldwi 43, 43, 43, 3 + xxsldwi 33, 33, 33, 1 + vperm 12, 8, 9, 12 + vrlw 0, 0, 5 + vadduwm 1, 0, 1 + xxlxor 38, 33, 0 + vadduwm 1, 1, 12 + vperm 6, 6, 6, 2 + vadduwm 15, 6, 11 + lvx 11, 0, 4 + addis 4, 2, .LCPI0_6@toc@ha + addi 4, 4, .LCPI0_6@toc@l + xxlxor 32, 47, 32 + lvx 17, 0, 4 + addis 4, 2, .LCPI0_9@toc@ha + vperm 14, 10, 7, 11 + addi 4, 4, .LCPI0_9@toc@l + vrlw 0, 0, 3 + vadduwm 1, 0, 1 + xxlxor 38, 33, 38 + vrlw 6, 6, 4 + vadduwm 8, 6, 15 + xxswapd 0, 38 + lvx 6, 0, 8 + xxlxor 32, 40, 32 + xxsldwi 40, 40, 40, 1 + vperm 13, 12, 18, 6 + vrlw 9, 0, 5 + vadduwm 0, 1, 14 + lvx 1, 0, 7 + xxsldwi 46, 46, 46, 3 + xxsldwi 32, 32, 32, 3 + vperm 7, 7, 7, 1 + vadduwm 15, 9, 0 + xxlxor 32, 47, 0 + vperm 16, 0, 0, 2 + lvx 0, 0, 6 + addis 6, 2, .LCPI0_10@toc@ha + vcmpequh 0, 0, 17 + vadduwm 19, 16, 8 + xxlxor 40, 51, 41 + xxsel 45, 39, 45, 32 + vrlw 31, 8, 3 + lvx 8, 0, 4 + addis 4, 2, .LCPI0_11@toc@ha + addi 4, 4, .LCPI0_11@toc@l + vcmpequh 7, 8, 17 + vadduwm 8, 15, 13 + vadduwm 15, 31, 8 + lvx 8, 0, 4 + addi 4, 6, .LCPI0_10@toc@l + lvx 17, 0, 4 + addis 4, 2, .LCPI0_12@toc@ha + xxlxor 41, 47, 48 + xxsldwi 47, 47, 47, 1 + addi 4, 4, .LCPI0_12@toc@l + xxlnor 48, 39, 39 + vrlw 29, 9, 4 + vperm 9, 16, 16, 8 + xxland 48, 50, 39 + vperm 17, 30, 12, 17 + vperm 16, 16, 16, 8 + vmrghw 12, 12, 10 + lvx 10, 0, 4 + addis 4, 2, .LCPI0_13@toc@ha + vadduwm 19, 29, 19 + addi 4, 4, .LCPI0_13@toc@l + xxlxor 63, 51, 63 + xxsldwi 51, 51, 51, 3 + xxland 0, 49, 41 + vrlw 17, 31, 5 + xxlor 48, 0, 48 + xxswapd 0, 61 + vperm 18, 12, 18, 10 + vadduwm 15, 15, 16 + xxland 60, 48, 39 + vadduwm 15, 17, 15 + vperm 28, 28, 28, 8 + xxlxor 63, 47, 0 + vadduwm 15, 15, 18 + vperm 31, 31, 31, 2 + vperm 30, 18, 16, 6 + vadduwm 19, 31, 19 + xxlxor 44, 51, 49 + vrlw 12, 12, 3 + vadduwm 15, 12, 15 + xxlxor 49, 47, 63 + vperm 31, 13, 14, 11 + vrlw 17, 17, 4 + vperm 14, 14, 14, 1 + vadduwm 15, 15, 31 + vadduwm 19, 17, 19 + xxswapd 0, 49 + xxsldwi 47, 47, 47, 3 + xxsel 46, 46, 62, 32 + xxlxor 44, 51, 44 + xxsldwi 51, 51, 51, 1 + vrlw 12, 12, 5 + vadduwm 15, 12, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 19, 17, 19 + xxlxor 44, 51, 44 + vrlw 29, 12, 3 + vadduwm 12, 15, 14 + vadduwm 15, 29, 12 + lvx 12, 0, 4 + addis 4, 2, .LCPI0_14@toc@ha + addi 4, 4, .LCPI0_14@toc@l + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + vperm 30, 13, 18, 12 + vrlw 17, 17, 4 + vmrghw 13, 18, 13 + xxland 0, 62, 41 + vadduwm 19, 17, 19 + vperm 16, 13, 16, 10 + xxlxor 61, 51, 61 + xxsldwi 50, 51, 51, 3 + xxsldwi 51, 63, 63, 3 + vrlw 30, 29, 5 + xxlor 61, 60, 0 + xxswapd 0, 49 + vperm 31, 14, 19, 11 + vadduwm 15, 15, 29 + vperm 19, 19, 19, 1 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 16 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 62 + vperm 30, 16, 29, 6 + vrlw 13, 13, 3 + vadduwm 15, 13, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 31 + xxsldwi 63, 63, 63, 3 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 45, 50, 45 + xxsldwi 50, 50, 50, 1 + vrlw 13, 13, 5 + vadduwm 15, 13, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 45 + vrlw 28, 13, 3 + xxsel 45, 51, 62, 32 + xxland 51, 61, 39 + vperm 30, 14, 16, 12 + vadduwm 15, 15, 13 + vperm 19, 19, 19, 8 + vmrghw 14, 16, 14 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 62, 41 + vrlw 17, 17, 4 + xxlor 51, 51, 0 + vadduwm 15, 15, 19 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 60, 50, 60 + xxsldwi 48, 50, 50, 3 + vperm 18, 14, 29, 10 + vrlw 30, 28, 5 + vperm 29, 18, 19, 6 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 18 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 62 + vperm 30, 13, 31, 11 + vrlw 14, 14, 3 + vperm 31, 31, 31, 1 + vadduwm 15, 14, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 46, 48, 46 + xxsldwi 48, 48, 48, 1 + vrlw 14, 14, 5 + vadduwm 15, 14, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 46 + vrlw 28, 14, 3 + xxsel 46, 63, 61, 32 + xxland 63, 51, 39 + vperm 29, 13, 18, 12 + vadduwm 15, 15, 14 + vperm 31, 31, 31, 8 + vmrghw 13, 18, 13 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 61, 41 + vrlw 17, 17, 4 + xxlor 63, 63, 0 + vperm 13, 13, 19, 10 + xxsldwi 51, 62, 62, 3 + vadduwm 15, 15, 31 + vperm 30, 14, 19, 11 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 60, 48, 60 + xxsldwi 48, 48, 48, 3 + vrlw 29, 28, 5 + vadduwm 15, 29, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 50, 48, 61 + vrlw 18, 18, 3 + vadduwm 15, 18, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 11, 17, 16 + xxswapd 0, 49 + xxlxor 48, 43, 50 + xxsldwi 43, 43, 43, 1 + vperm 18, 19, 19, 1 + vrlw 16, 16, 5 + vperm 19, 13, 31, 6 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 29, 17, 11 + xxlxor 43, 61, 48 + vrlw 16, 11, 3 + xxsel 43, 50, 51, 32 + xxland 50, 63, 39 + vperm 19, 14, 13, 12 + vadduwm 15, 15, 11 + vperm 18, 18, 18, 8 + vmrghw 13, 13, 14 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 51, 41 + lvx 19, 0, 4 + vrlw 17, 17, 4 + xxlor 50, 50, 0 + vperm 13, 13, 31, 10 + xxsldwi 63, 62, 62, 3 + vadduwm 15, 15, 18 + vperm 19, 11, 31, 19 + vadduwm 29, 17, 29 + xxswapd 0, 49 + vperm 1, 31, 31, 1 + xxlxor 48, 61, 48 + xxsldwi 46, 61, 61, 3 + vperm 6, 13, 18, 6 + vrlw 16, 16, 5 + xxsel 32, 33, 38, 32 + xxland 38, 50, 39 + vadduwm 15, 16, 15 + vperm 7, 11, 13, 12 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vperm 6, 6, 6, 8 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 3 + vrlw 17, 17, 4 + vadduwm 15, 15, 19 + vadduwm 14, 17, 14 + xxswapd 0, 49 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 5 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vadduwm 0, 15, 0 + vperm 17, 17, 17, 2 + xxland 0, 39, 41 + xxlor 38, 38, 0 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 0, 16, 0 + xxlxor 33, 32, 49 + xxsldwi 32, 32, 32, 1 + vrlw 1, 1, 4 + vadduwm 0, 0, 6 + vadduwm 8, 1, 14 + xxswapd 0, 33 + xxlxor 44, 40, 48 + xxsldwi 38, 40, 40, 3 + vrlw 7, 12, 5 + vadduwm 0, 7, 0 + xxlxor 33, 32, 0 + vperm 2, 1, 1, 2 + vmrghw 1, 13, 11 + vadduwm 6, 2, 6 + vperm 1, 1, 18, 10 + xxlxor 39, 38, 39 + vrlw 3, 7, 3 + vadduwm 0, 0, 1 + vadduwm 0, 3, 0 + xxlxor 34, 32, 34 + xxsldwi 0, 32, 32, 3 + vrlw 2, 2, 4 + vadduwm 4, 2, 6 + xxswapd 2, 34 + xxlxor 35, 36, 35 + xxsldwi 1, 36, 36, 1 + vrlw 3, 3, 5 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 1, 35, 2 + stxvd2x 0, 0, 3 + xxswapd 1, 1 + stxvd2x 1, 3, 5 + li 3, -16 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI1_1: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI1_2: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_3: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_4: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_5: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_6: + .short 1 + .short 2 + .short 4 + .short 8 + .short 16 + .short 32 + .short 64 + .short 128 +.LCPI1_7: + .short 0 + .short 0 + .short 4 + .short 8 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI1_8: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_9: + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI1_10: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_12: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_13: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_14: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .text + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: +.Lfunc_begin1: + .cfi_startproc +.Lfunc_gep1: + addis 2, 12, .TOC.-.Lfunc_gep1@ha + addi 2, 2, .TOC.-.Lfunc_gep1@l +.Lfunc_lep1: + .localentry zfs_blake3_compress_xof_sse2, .Lfunc_lep1-.Lfunc_gep1 + li 9, -80 + mtvsrd 35, 5 + li 5, 16 + lfdx 0, 0, 4 + addis 10, 2, .LCPI1_2@toc@ha + vspltisw 12, 9 + std 30, -16(1) + addis 12, 2, .LCPI1_8@toc@ha + addis 30, 2, .LCPI1_5@toc@ha + addis 11, 2, .LCPI1_7@toc@ha + stxvd2x 60, 1, 9 + li 9, -64 + mtvsrd 36, 7 + lfd 2, 16(4) + addi 10, 10, .LCPI1_2@toc@l + addi 12, 12, .LCPI1_8@toc@l + addi 11, 11, .LCPI1_7@toc@l + stxvd2x 61, 1, 9 + li 9, -48 + lfd 3, 24(4) + mtvsrwz 37, 6 + rldicl 6, 6, 32, 32 + lvx 9, 0, 10 + stxvd2x 62, 1, 9 + li 9, -32 + li 10, 32 + stxvd2x 63, 1, 9 + li 9, 0 + mtvsrd 34, 9 + xxmrghd 33, 3, 2 + lfd 1, 8(4) + vmrghb 3, 2, 3 + vmrghb 4, 2, 4 + vspltb 2, 2, 7 + xxmrghd 32, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + vpkudum 7, 1, 0 + vmrglh 3, 2, 3 + vmrglh 2, 2, 4 + mtvsrwz 36, 6 + addis 6, 2, .LCPI1_0@toc@ha + addi 6, 6, .LCPI1_0@toc@l + vperm 10, 1, 0, 9 + vmrghw 4, 4, 5 + xxswapd 37, 1 + vmrglw 2, 2, 3 + xxswapd 35, 0 + lxvd2x 0, 4, 10 + xxspltd 62, 42, 1 + vadduwm 3, 7, 3 + vadduwm 6, 3, 5 + xxmrgld 36, 34, 36 + lvx 2, 0, 6 + addis 6, 2, .LCPI1_1@toc@ha + addi 6, 6, .LCPI1_1@toc@l + xxlxor 35, 38, 36 + lvx 4, 0, 6 + li 6, 48 + lxvd2x 1, 4, 6 + addis 4, 2, .LCPI1_3@toc@ha + vperm 8, 3, 3, 2 + vspltisw 3, 10 + addi 4, 4, .LCPI1_3@toc@l + xxswapd 41, 1 + vadduwm 3, 3, 3 + vadduwm 11, 8, 4 + xxlxor 36, 43, 37 + vadduwm 5, 6, 10 + vrlw 0, 4, 3 + vspltisw 4, 12 + vadduwm 4, 4, 4 + vadduwm 1, 0, 5 + xxlxor 37, 33, 40 + xxswapd 40, 0 + vrlw 6, 5, 4 + vspltisw 5, -16 + vpkudum 13, 9, 8 + vsubuwm 5, 12, 5 + lvx 12, 0, 4 + addis 4, 2, .LCPI1_4@toc@ha + addi 4, 4, .LCPI1_4@toc@l + vadduwm 11, 6, 11 + xxswapd 0, 38 + vadduwm 1, 1, 13 + xxsldwi 50, 45, 45, 1 + xxlxor 32, 43, 32 + xxsldwi 43, 43, 43, 3 + xxsldwi 33, 33, 33, 1 + vperm 12, 8, 9, 12 + vrlw 0, 0, 5 + vadduwm 1, 0, 1 + xxlxor 38, 33, 0 + vadduwm 1, 1, 12 + vperm 6, 6, 6, 2 + vadduwm 15, 6, 11 + lvx 11, 0, 4 + addis 4, 2, .LCPI1_6@toc@ha + addi 4, 4, .LCPI1_6@toc@l + xxlxor 32, 47, 32 + lvx 17, 0, 4 + addi 4, 30, .LCPI1_5@toc@l + vperm 14, 10, 7, 11 + vrlw 0, 0, 3 + vadduwm 1, 0, 1 + xxlxor 38, 33, 38 + vrlw 6, 6, 4 + vadduwm 8, 6, 15 + xxswapd 0, 38 + lvx 6, 0, 4 + addis 4, 2, .LCPI1_9@toc@ha + addi 4, 4, .LCPI1_9@toc@l + xxlxor 32, 40, 32 + xxsldwi 40, 40, 40, 1 + vperm 13, 12, 18, 6 + vrlw 9, 0, 5 + vadduwm 0, 1, 14 + lvx 1, 0, 12 + xxsldwi 46, 46, 46, 3 + xxsldwi 32, 32, 32, 3 + vperm 7, 7, 7, 1 + vadduwm 15, 9, 0 + xxlxor 32, 47, 0 + vperm 16, 0, 0, 2 + lvx 0, 0, 11 + addis 11, 2, .LCPI1_10@toc@ha + vcmpequh 0, 0, 17 + vadduwm 19, 16, 8 + xxlxor 40, 51, 41 + xxsel 45, 39, 45, 32 + vrlw 31, 8, 3 + lvx 8, 0, 4 + addis 4, 2, .LCPI1_11@toc@ha + addi 4, 4, .LCPI1_11@toc@l + vcmpequh 7, 8, 17 + vadduwm 8, 15, 13 + vadduwm 15, 31, 8 + lvx 8, 0, 4 + addi 4, 11, .LCPI1_10@toc@l + lvx 17, 0, 4 + addis 4, 2, .LCPI1_12@toc@ha + xxlxor 41, 47, 48 + xxsldwi 47, 47, 47, 1 + addi 4, 4, .LCPI1_12@toc@l + xxlnor 48, 39, 39 + vrlw 29, 9, 4 + vperm 9, 16, 16, 8 + xxland 48, 50, 39 + vperm 17, 30, 12, 17 + vperm 16, 16, 16, 8 + vmrghw 12, 12, 10 + lvx 10, 0, 4 + addis 4, 2, .LCPI1_13@toc@ha + vadduwm 19, 29, 19 + addi 4, 4, .LCPI1_13@toc@l + xxlxor 63, 51, 63 + xxsldwi 51, 51, 51, 3 + xxland 0, 49, 41 + vrlw 17, 31, 5 + xxlor 48, 0, 48 + xxswapd 0, 61 + vperm 18, 12, 18, 10 + vadduwm 15, 15, 16 + xxland 60, 48, 39 + vadduwm 15, 17, 15 + vperm 28, 28, 28, 8 + xxlxor 63, 47, 0 + vadduwm 15, 15, 18 + vperm 31, 31, 31, 2 + vperm 30, 18, 16, 6 + vadduwm 19, 31, 19 + xxlxor 44, 51, 49 + vrlw 12, 12, 3 + vadduwm 15, 12, 15 + xxlxor 49, 47, 63 + vperm 31, 13, 14, 11 + vrlw 17, 17, 4 + vperm 14, 14, 14, 1 + vadduwm 15, 15, 31 + vadduwm 19, 17, 19 + xxswapd 0, 49 + xxsldwi 47, 47, 47, 3 + xxsel 46, 46, 62, 32 + xxlxor 44, 51, 44 + xxsldwi 51, 51, 51, 1 + vrlw 12, 12, 5 + vadduwm 15, 12, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 19, 17, 19 + xxlxor 44, 51, 44 + vrlw 29, 12, 3 + vadduwm 12, 15, 14 + vadduwm 15, 29, 12 + lvx 12, 0, 4 + addis 4, 2, .LCPI1_14@toc@ha + addi 4, 4, .LCPI1_14@toc@l + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + vperm 30, 13, 18, 12 + vrlw 17, 17, 4 + vmrghw 13, 18, 13 + xxland 0, 62, 41 + vadduwm 19, 17, 19 + vperm 16, 13, 16, 10 + xxlxor 61, 51, 61 + xxsldwi 50, 51, 51, 3 + xxsldwi 51, 63, 63, 3 + vrlw 30, 29, 5 + xxlor 61, 60, 0 + xxswapd 0, 49 + vperm 31, 14, 19, 11 + vadduwm 15, 15, 29 + vperm 19, 19, 19, 1 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 16 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 62 + vperm 30, 16, 29, 6 + vrlw 13, 13, 3 + vadduwm 15, 13, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 31 + xxsldwi 63, 63, 63, 3 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 45, 50, 45 + xxsldwi 50, 50, 50, 1 + vrlw 13, 13, 5 + vadduwm 15, 13, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 45 + vrlw 28, 13, 3 + xxsel 45, 51, 62, 32 + xxland 51, 61, 39 + vperm 30, 14, 16, 12 + vadduwm 15, 15, 13 + vperm 19, 19, 19, 8 + vmrghw 14, 16, 14 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 62, 41 + vrlw 17, 17, 4 + xxlor 51, 51, 0 + vadduwm 15, 15, 19 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 60, 50, 60 + xxsldwi 48, 50, 50, 3 + vperm 18, 14, 29, 10 + vrlw 30, 28, 5 + vperm 29, 18, 19, 6 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 18 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 62 + vperm 30, 13, 31, 11 + vrlw 14, 14, 3 + vperm 31, 31, 31, 1 + vadduwm 15, 14, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 46, 48, 46 + xxsldwi 48, 48, 48, 1 + vrlw 14, 14, 5 + vadduwm 15, 14, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 46 + vrlw 28, 14, 3 + xxsel 46, 63, 61, 32 + xxland 63, 51, 39 + vperm 29, 13, 18, 12 + vadduwm 15, 15, 14 + vperm 31, 31, 31, 8 + vmrghw 13, 18, 13 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 61, 41 + vrlw 17, 17, 4 + xxlor 63, 63, 0 + vperm 13, 13, 19, 10 + xxsldwi 51, 62, 62, 3 + vadduwm 15, 15, 31 + vperm 30, 14, 19, 11 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 60, 48, 60 + xxsldwi 48, 48, 48, 3 + vrlw 29, 28, 5 + vadduwm 15, 29, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 50, 48, 61 + vrlw 18, 18, 3 + vadduwm 15, 18, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 11, 17, 16 + xxswapd 0, 49 + xxlxor 48, 43, 50 + xxsldwi 43, 43, 43, 1 + vperm 18, 19, 19, 1 + vrlw 16, 16, 5 + vperm 19, 13, 31, 6 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 29, 17, 11 + xxlxor 43, 61, 48 + vrlw 16, 11, 3 + xxsel 43, 50, 51, 32 + xxland 50, 63, 39 + vperm 19, 14, 13, 12 + vadduwm 15, 15, 11 + vperm 18, 18, 18, 8 + vmrghw 13, 13, 14 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 51, 41 + lvx 19, 0, 4 + vrlw 17, 17, 4 + xxlor 50, 50, 0 + vperm 13, 13, 31, 10 + xxsldwi 63, 62, 62, 3 + vadduwm 15, 15, 18 + vperm 19, 11, 31, 19 + vadduwm 29, 17, 29 + xxswapd 0, 49 + vperm 1, 31, 31, 1 + xxlxor 48, 61, 48 + xxsldwi 46, 61, 61, 3 + vperm 6, 13, 18, 6 + vrlw 16, 16, 5 + xxsel 32, 33, 38, 32 + xxland 38, 50, 39 + vadduwm 15, 16, 15 + vperm 7, 11, 13, 12 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vperm 6, 6, 6, 8 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 3 + vrlw 17, 17, 4 + vadduwm 15, 15, 19 + vadduwm 14, 17, 14 + xxswapd 0, 49 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 5 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vadduwm 0, 15, 0 + vperm 17, 17, 17, 2 + xxland 0, 39, 41 + xxlor 38, 38, 0 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 0, 16, 0 + xxlxor 33, 32, 49 + xxsldwi 32, 32, 32, 1 + vrlw 1, 1, 4 + vadduwm 0, 0, 6 + vadduwm 8, 1, 14 + xxswapd 0, 33 + xxlxor 44, 40, 48 + xxsldwi 38, 40, 40, 3 + vrlw 7, 12, 5 + vadduwm 0, 7, 0 + xxlxor 33, 32, 0 + vperm 2, 1, 1, 2 + vmrghw 1, 13, 11 + vadduwm 6, 2, 6 + vperm 1, 1, 18, 10 + xxlxor 39, 38, 39 + vrlw 3, 7, 3 + vadduwm 0, 0, 1 + vadduwm 0, 3, 0 + xxlxor 34, 32, 34 + xxsldwi 0, 32, 32, 3 + vrlw 2, 2, 4 + vadduwm 4, 2, 6 + xxswapd 2, 34 + xxlxor 35, 36, 35 + xxsldwi 1, 36, 36, 1 + vrlw 3, 3, 5 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 3, 35, 2 + stxvd2x 0, 0, 8 + xxswapd 3, 3 + stxvd2x 3, 8, 5 + lfdx 0, 0, 3 + lfd 3, 8(3) + xxmrghd 34, 3, 0 + xxlxor 0, 1, 34 + xxswapd 0, 0 + stxvd2x 0, 8, 10 + lfd 0, 16(3) + lfd 1, 24(3) + li 3, -32 + xxmrghd 34, 1, 0 + xxlxor 0, 2, 34 + xxswapd 0, 0 + stxvd2x 0, 8, 6 + lxvd2x 63, 1, 3 + li 3, -48 + ld 30, -16(1) + lxvd2x 62, 1, 3 + li 3, -64 + lxvd2x 61, 1, 3 + li 3, -80 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-.Lfunc_begin1 + .cfi_endproc + + .globl zfs_blake3_hash_many_sse2 + .p2align 2 + .type zfs_blake3_hash_many_sse2,@function +zfs_blake3_hash_many_sse2: +.Lfunc_begin2: + .cfi_startproc +.Lfunc_gep2: + addis 2, 12, .TOC.-.Lfunc_gep2@ha + addi 2, 2, .TOC.-.Lfunc_gep2@l +.Lfunc_lep2: + .localentry zfs_blake3_hash_many_sse2, .Lfunc_lep2-.Lfunc_gep2 + mfocrf 12, 32 + mflr 0 + std 0, 16(1) + stw 12, 8(1) + stdu 1, -256(1) + .cfi_def_cfa_offset 256 + .cfi_offset lr, 16 + .cfi_offset r17, -120 + .cfi_offset r18, -112 + .cfi_offset r19, -104 + .cfi_offset r20, -96 + .cfi_offset r21, -88 + .cfi_offset r22, -80 + .cfi_offset r23, -72 + .cfi_offset r24, -64 + .cfi_offset r25, -56 + .cfi_offset r26, -48 + .cfi_offset r27, -40 + .cfi_offset r28, -32 + .cfi_offset r29, -24 + .cfi_offset r30, -16 + .cfi_offset cr2, 8 + std 26, 208(1) + mr 26, 4 + cmpldi 1, 4, 4 + andi. 4, 8, 1 + std 18, 144(1) + std 19, 152(1) + crmove 8, 1 + ld 19, 360(1) + lwz 18, 352(1) + std 24, 192(1) + std 25, 200(1) + std 27, 216(1) + std 28, 224(1) + mr 24, 10 + mr 28, 6 + mr 27, 5 + mr 25, 3 + std 29, 232(1) + std 30, 240(1) + mr 30, 9 + mr 29, 7 + std 17, 136(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + blt 1, .LBB2_3 + li 3, 0 + li 4, 1 + clrldi 23, 30, 32 + isel 22, 4, 3, 8 + clrldi 21, 24, 32 + clrldi 20, 18, 32 +.LBB2_2: + mr 3, 25 + mr 4, 27 + mr 5, 28 + mr 6, 29 + mr 7, 22 + mr 8, 23 + mr 9, 21 + mr 10, 20 + std 19, 32(1) + bl blake3_hash4_sse2 + addi 26, 26, -4 + addi 3, 29, 4 + addi 25, 25, 32 + addi 19, 19, 128 + cmpldi 26, 3 + isel 29, 3, 29, 8 + bgt 0, .LBB2_2 +.LBB2_3: + cmpldi 26, 0 + beq 0, .LBB2_11 + li 3, 0 + li 4, 1 + or 21, 24, 30 + li 20, 16 + addi 24, 1, 96 + isel 22, 4, 3, 8 +.LBB2_5: + lxvd2x 0, 28, 20 + ld 23, 0(25) + mr 17, 27 + mr 3, 21 + stxvd2x 0, 24, 20 + lxvd2x 0, 0, 28 + stxvd2x 0, 0, 24 +.LBB2_6: + cmpldi 17, 1 + beq 0, .LBB2_8 + cmpldi 17, 0 + bne 0, .LBB2_9 + b .LBB2_10 +.LBB2_8: + or 3, 3, 18 +.LBB2_9: + clrldi 7, 3, 56 + mr 3, 24 + mr 4, 23 + li 5, 64 + mr 6, 29 + bl zfs_blake3_compress_in_place_sse2 + addi 23, 23, 64 + addi 17, 17, -1 + mr 3, 30 + b .LBB2_6 +.LBB2_10: + lxvd2x 0, 24, 20 + addi 26, 26, -1 + add 29, 29, 22 + addi 25, 25, 8 + cmpldi 26, 0 + stxvd2x 0, 19, 20 + lxvd2x 0, 0, 24 + stxvd2x 0, 0, 19 + addi 19, 19, 32 + bne 0, .LBB2_5 +.LBB2_11: + ld 30, 240(1) + ld 29, 232(1) + ld 28, 224(1) + ld 27, 216(1) + ld 26, 208(1) + ld 25, 200(1) + ld 24, 192(1) + ld 23, 184(1) + ld 22, 176(1) + ld 21, 168(1) + ld 20, 160(1) + ld 19, 152(1) + ld 18, 144(1) + ld 17, 136(1) + addi 1, 1, 256 + ld 0, 16(1) + lwz 12, 8(1) + mtocrf 32, 12 + mtlr 0 + blr + .long 0 + .quad 0 +.Lfunc_end2: + .size zfs_blake3_hash_many_sse2, .Lfunc_end2-.Lfunc_begin2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI3_0: + .quad 4294967296 + .quad 12884901890 +.LCPI3_1: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI3_2: + .long 1779033703 + .long 1779033703 + .long 1779033703 + .long 1779033703 +.LCPI3_3: + .long 3144134277 + .long 3144134277 + .long 3144134277 + .long 3144134277 +.LCPI3_4: + .long 1013904242 + .long 1013904242 + .long 1013904242 + .long 1013904242 +.LCPI3_5: + .long 2773480762 + .long 2773480762 + .long 2773480762 + .long 2773480762 + .text + .p2align 2 + .type blake3_hash4_sse2,@function +blake3_hash4_sse2: +.Lfunc_begin3: + .cfi_startproc +.Lfunc_gep3: + addis 2, 12, .TOC.-.Lfunc_gep3@ha + addi 2, 2, .TOC.-.Lfunc_gep3@l +.Lfunc_lep3: + .localentry blake3_hash4_sse2, .Lfunc_lep3-.Lfunc_gep3 + stdu 1, -400(1) + .cfi_def_cfa_offset 400 + .cfi_offset r22, -152 + .cfi_offset r23, -144 + .cfi_offset r24, -136 + .cfi_offset r25, -128 + .cfi_offset r26, -120 + .cfi_offset r27, -112 + .cfi_offset r28, -104 + .cfi_offset r29, -96 + .cfi_offset r30, -88 + .cfi_offset f23, -72 + .cfi_offset f24, -64 + .cfi_offset f25, -56 + .cfi_offset f26, -48 + .cfi_offset f27, -40 + .cfi_offset f28, -32 + .cfi_offset f29, -24 + .cfi_offset f30, -16 + .cfi_offset f31, -8 + .cfi_offset v20, -352 + .cfi_offset v21, -336 + .cfi_offset v22, -320 + .cfi_offset v23, -304 + .cfi_offset v24, -288 + .cfi_offset v25, -272 + .cfi_offset v26, -256 + .cfi_offset v27, -240 + .cfi_offset v28, -224 + .cfi_offset v29, -208 + .cfi_offset v30, -192 + .cfi_offset v31, -176 + li 11, 48 + li 0, 8 + std 30, 312(1) + li 30, 12 + li 12, 4 + lfiwzx 0, 0, 5 + stxvd2x 52, 1, 11 + li 11, 64 + lfiwzx 2, 5, 0 + li 0, 20 + lfiwzx 3, 5, 30 + stxvd2x 53, 1, 11 + li 11, 80 + li 30, 24 + lfiwzx 4, 5, 0 + li 0, 28 + stxvd2x 54, 1, 11 + li 11, 96 + lfiwzx 1, 5, 12 + lfiwzx 6, 5, 30 + xxspltw 45, 0, 1 + cmpldi 4, 0 + std 22, 248(1) + stxvd2x 55, 1, 11 + li 11, 112 + lfiwzx 7, 5, 0 + xxspltw 40, 2, 1 + std 23, 256(1) + xxspltw 38, 3, 1 + xxspltw 50, 4, 1 + std 24, 264(1) + std 25, 272(1) + std 26, 280(1) + xxspltw 54, 7, 1 + std 27, 288(1) + std 28, 296(1) + std 29, 304(1) + stxvd2x 56, 1, 11 + li 11, 128 + stfd 23, 328(1) + stxvd2x 57, 1, 11 + li 11, 144 + stfd 24, 336(1) + stxvd2x 58, 1, 11 + li 11, 160 + stfd 25, 344(1) + stxvd2x 59, 1, 11 + li 11, 176 + xxspltw 59, 1, 1 + stxvd2x 60, 1, 11 + li 11, 192 + stfd 26, 352(1) + stxvd2x 61, 1, 11 + li 11, 208 + stfd 27, 360(1) + stxvd2x 62, 1, 11 + li 11, 224 + xxspltw 62, 6, 1 + stxvd2x 63, 1, 11 + li 11, 16 + stfd 28, 368(1) + lfiwzx 5, 5, 11 + ld 5, 432(1) + stfd 29, 376(1) + stfd 30, 384(1) + stfd 31, 392(1) + xxspltw 61, 5, 1 + beq 0, .LBB3_5 + addis 30, 2, .LCPI3_0@toc@ha + neg 7, 7 + xxleqv 34, 34, 34 + addis 28, 2, .LCPI3_2@toc@ha + addis 27, 2, .LCPI3_3@toc@ha + addis 26, 2, .LCPI3_4@toc@ha + addis 25, 2, .LCPI3_5@toc@ha + ld 29, 24(3) + addi 0, 30, .LCPI3_0@toc@l + mtfprwz 1, 7 + addis 7, 2, .LCPI3_1@toc@ha + ld 30, 16(3) + lxvd2x 0, 0, 0 + mtfprwz 2, 6 + rldicl 6, 6, 32, 32 + addi 0, 7, .LCPI3_1@toc@l + ld 7, 8(3) + vslw 2, 2, 2 + lvx 5, 0, 0 + addi 0, 28, .LCPI3_2@toc@l + addi 28, 27, .LCPI3_3@toc@l + addi 27, 26, .LCPI3_4@toc@l + addi 26, 25, .LCPI3_5@toc@l + or 25, 9, 8 + li 9, 0 + xxspltw 36, 2, 1 + xxswapd 35, 0 + xxspltw 0, 1, 1 + xxland 35, 0, 35 + mtfprwz 0, 6 + ld 6, 0(3) + addi 3, 3, -8 + vadduwm 4, 3, 4 + xxlor 35, 35, 34 + xxlxor 34, 36, 34 + xxlor 9, 36, 36 + vspltisw 4, 4 + vcmpgtsw 2, 3, 2 + xxspltw 35, 0, 1 + xxlor 10, 36, 36 + vsubuwm 2, 3, 2 + xxlor 11, 34, 34 + lvx 2, 0, 0 + li 0, 32 + xxlor 12, 34, 34 + lvx 2, 0, 28 + li 28, 48 + xxlor 13, 34, 34 + lvx 2, 0, 27 + li 27, 0 + xxlor 31, 34, 34 + lvx 2, 0, 26 + xxlor 30, 34, 34 +.LBB3_2: + mr 26, 27 + addi 27, 27, 1 + xxlor 28, 40, 40 + cmpld 27, 4 + sldi 26, 26, 6 + xxlor 24, 45, 45 + iseleq 24, 10, 9 + add 23, 6, 26 + add 22, 30, 26 + lxvd2x 0, 6, 26 + lxvd2x 1, 7, 26 + or 25, 24, 25 + add 24, 7, 26 + lxvd2x 2, 30, 26 + lxvd2x 3, 29, 26 + xxlor 29, 38, 38 + lxvd2x 4, 23, 11 + lxvd2x 6, 24, 11 + clrlwi 25, 25, 24 + lxvd2x 7, 22, 11 + lxvd2x 8, 23, 0 + mtfprd 5, 25 + add 25, 29, 26 + xxswapd 34, 0 + lxvd2x 0, 25, 11 + xxswapd 36, 1 + xxswapd 33, 2 + lxvd2x 1, 24, 0 + lxvd2x 2, 22, 0 + xxswapd 39, 3 + xxswapd 32, 4 + lxvd2x 3, 25, 0 + lxvd2x 4, 23, 28 + xxswapd 49, 6 + xxswapd 51, 7 + lxvd2x 6, 24, 28 + xxswapd 58, 8 + lxvd2x 7, 22, 28 + lxvd2x 8, 25, 28 + xxswapd 60, 0 + mr 25, 3 + xxswapd 57, 1 + xxswapd 53, 2 + xxswapd 52, 3 + xxswapd 56, 4 + xxswapd 55, 6 + xxswapd 0, 5 + xxswapd 40, 7 + xxswapd 41, 8 + mtctr 12 +.LBB3_3: + ldu 24, 8(25) + add 24, 24, 26 + addi 24, 24, 256 + dcbt 0, 24 + bdnz .LBB3_3 + vmrgew 3, 4, 2 + vspltisw 31, 9 + mr 25, 8 + vmrglw 10, 4, 2 + vspltisw 14, 10 + vmrghw 6, 4, 2 + xxspltw 0, 0, 3 + vmrgew 4, 17, 0 + vmrglw 11, 17, 0 + vmrghw 16, 17, 0 + vmrgew 0, 25, 26 + vmrgew 13, 7, 1 + vmrglw 2, 7, 1 + vmrghw 7, 7, 1 + xxlor 25, 36, 36 + vmrgew 4, 28, 19 + xxlor 26, 32, 32 + vmrglw 0, 25, 26 + vmrglw 1, 28, 19 + xxmrgld 47, 34, 42 + xxlor 44, 28, 28 + vmrghw 25, 25, 26 + xxlor 23, 36, 36 + vmrghw 4, 28, 19 + vspltisw 19, -16 + xxlor 5, 32, 32 + vmrgew 0, 20, 21 + xxmrgld 34, 33, 43 + vmrglw 28, 20, 21 + vmrghw 21, 20, 21 + vmrglw 20, 23, 24 + vmrghw 26, 23, 24 + vmrglw 17, 9, 8 + xxlor 8, 32, 32 + vmrgew 0, 23, 24 + xxmrgld 56, 39, 38 + vmrgew 23, 9, 8 + xxlor 33, 24, 24 + xxlor 2, 34, 34 + vadduwm 11, 15, 1 + xxmrgld 33, 36, 48 + xxlor 6, 47, 47 + xxlor 27, 32, 32 + vmrghw 0, 9, 8 + vspltisw 9, 12 + vsubuwm 8, 31, 19 + xxmrgld 51, 23, 25 + vadduwm 31, 2, 12 + xxlor 34, 10, 10 + vadduwm 10, 14, 14 + vslw 15, 2, 2 + xxlor 34, 29, 29 + vadduwm 14, 24, 27 + xxlor 24, 48, 48 + vadduwm 16, 1, 2 + xxmrgld 34, 45, 35 + vadduwm 31, 31, 30 + xxmrghd 36, 36, 24 + vadduwm 11, 11, 29 + vadduwm 14, 14, 18 + vadduwm 13, 16, 22 + xxlxor 47, 63, 47 + xxlor 1, 9, 9 + xxlor 1, 11, 11 + xxlxor 48, 43, 9 + vadduwm 11, 11, 2 + xxlor 7, 34, 34 + xxmrghd 34, 39, 38 + xxlxor 39, 46, 11 + xxlor 1, 50, 50 + xxlxor 50, 45, 0 + vperm 15, 15, 15, 5 + vperm 16, 16, 16, 5 + vperm 7, 7, 7, 5 + vperm 18, 18, 18, 5 + xxlor 4, 33, 33 + xxlor 33, 31, 31 + vadduwm 14, 14, 2 + xxlor 3, 34, 34 + xxlor 34, 12, 12 + xxlor 35, 13, 13 + vadduwm 6, 15, 1 + xxlor 33, 30, 30 + vadduwm 2, 16, 2 + vadduwm 3, 7, 3 + vadduwm 12, 18, 1 + xxlxor 59, 34, 61 + xxlxor 61, 35, 1 + xxlxor 33, 38, 62 + xxlxor 62, 44, 54 + vrlw 22, 27, 10 + vrlw 29, 29, 10 + vrlw 1, 1, 10 + vrlw 30, 30, 10 + vadduwm 31, 31, 19 + vadduwm 13, 13, 4 + vadduwm 11, 22, 11 + vadduwm 14, 29, 14 + vadduwm 31, 1, 31 + vadduwm 13, 30, 13 + vadduwm 9, 9, 9 + xxlor 1, 36, 36 + xxlxor 48, 43, 48 + xxlxor 36, 46, 39 + xxmrgld 39, 60, 5 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 28, 4, 9 + xxmrgld 36, 53, 57 + vrlw 15, 15, 9 + xxmrghd 57, 53, 57 + vrlw 18, 18, 9 + vadduwm 14, 14, 4 + xxlor 0, 36, 36 + xxmrgld 36, 49, 52 + vadduwm 2, 16, 2 + xxmrgld 49, 8, 26 + vadduwm 3, 28, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 54, 34, 54 + xxlxor 61, 35, 61 + xxlxor 33, 38, 33 + xxlxor 62, 44, 62 + vrlw 29, 29, 8 + vrlw 20, 1, 8 + xxmrgld 33, 55, 27 + vrlw 30, 30, 8 + vrlw 22, 22, 8 + vadduwm 11, 11, 7 + xxlor 5, 39, 39 + xxmrgld 39, 32, 58 + vadduwm 31, 31, 4 + vadduwm 11, 29, 11 + vadduwm 13, 13, 7 + vadduwm 14, 20, 14 + vadduwm 31, 30, 31 + vadduwm 13, 22, 13 + xxlor 28, 36, 36 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 60 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vadduwm 11, 11, 17 + vmr 28, 17 + xxmrghd 49, 32, 58 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 21, 4, 2 + vadduwm 3, 15, 3 + xxlxor 34, 38, 61 + xxlxor 61, 44, 52 + xxlxor 62, 53, 62 + xxlxor 54, 35, 54 + vrlw 20, 2, 10 + vrlw 29, 29, 10 + vrlw 0, 30, 10 + vrlw 30, 22, 10 + vadduwm 14, 14, 25 + vadduwm 31, 31, 1 + vadduwm 13, 13, 17 + vadduwm 11, 20, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 24 + xxlor 8, 56, 56 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 21 + vadduwm 3, 15, 3 + xxlxor 55, 38, 52 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 25, 51, 51 + vmr 26, 17 + xxlor 49, 3, 3 + xxlor 52, 1, 1 + xxlor 51, 2, 2 + vadduwm 14, 14, 17 + vadduwm 31, 31, 20 + vadduwm 13, 13, 19 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 29, 39, 39 + xxlor 59, 4, 4 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 53, 0, 0 + xxlor 39, 6, 6 + vadduwm 11, 11, 27 + vadduwm 14, 14, 21 + vadduwm 31, 31, 7 + vadduwm 13, 13, 1 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 34, 7, 7 + vadduwm 31, 31, 28 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 11, 11, 2 + xxlor 34, 28, 28 + vadduwm 13, 13, 26 + vadduwm 14, 14, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 2, 58, 58 + xxlor 39, 25, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 54, 29, 29 + xxlor 58, 5, 5 + vadduwm 11, 11, 25 + vadduwm 14, 14, 7 + vadduwm 31, 31, 22 + vadduwm 13, 13, 26 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 17 + vadduwm 14, 14, 21 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 31, 31, 1 + vadduwm 13, 13, 20 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 0, 33, 33 + xxlor 33, 8, 8 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vadduwm 11, 11, 19 + vadduwm 14, 14, 2 + vadduwm 31, 31, 1 + vadduwm 13, 13, 22 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + vadduwm 11, 11, 27 + vadduwm 14, 14, 28 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 31, 31, 25 + vadduwm 13, 13, 26 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 3, 7, 7 + vadduwm 11, 11, 7 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 33, 6, 6 + xxlor 58, 2, 2 + xxlor 39, 3, 3 + vadduwm 14, 14, 1 + vadduwm 31, 31, 26 + vadduwm 13, 13, 7 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + xxlor 52, 0, 0 + vadduwm 11, 11, 21 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 14, 14, 2 + vadduwm 31, 31, 22 + vadduwm 13, 13, 20 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 7, 49, 49 + vmr 17, 2 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 54, 1, 1 + xxlor 34, 7, 7 + vadduwm 11, 11, 22 + vadduwm 14, 14, 28 + vadduwm 31, 31, 2 + vadduwm 13, 13, 26 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 59, 25, 25 + vadduwm 11, 11, 19 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 14, 14, 25 + vadduwm 31, 31, 27 + vadduwm 13, 13, 7 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vmr 2, 19 + xxlor 0, 7, 7 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 1, 51, 51 + xxlor 7, 39, 39 + xxlor 51, 8, 8 + xxlor 39, 5, 5 + xxlor 34, 4, 4 + vadduwm 11, 11, 1 + vadduwm 14, 14, 19 + vadduwm 31, 31, 7 + vadduwm 13, 13, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + xxlor 2, 53, 53 + vmr 21, 28 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 53, 29, 29 + vadduwm 11, 11, 17 + vadduwm 14, 14, 28 + vadduwm 31, 31, 26 + vadduwm 13, 13, 21 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + vadduwm 11, 11, 20 + xxlor 5, 52, 52 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 52, 2, 2 + vadduwm 14, 14, 25 + vadduwm 31, 31, 20 + vadduwm 13, 13, 7 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + vadduwm 11, 11, 22 + vadduwm 14, 14, 27 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 31, 31, 1 + vadduwm 13, 13, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 3, 29, 29 + xxlor 4, 49, 49 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + vmr 17, 28 + xxlor 2, 54, 54 + xxlor 3, 34, 34 + xxlor 34, 8, 8 + xxlor 51, 0, 0 + xxlor 60, 7, 7 + xxlor 54, 1, 1 + vadduwm 11, 11, 2 + vadduwm 14, 14, 19 + vadduwm 31, 31, 28 + vadduwm 13, 13, 22 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 17 + vadduwm 14, 14, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 31, 31, 7 + vadduwm 13, 13, 26 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 6, 39, 39 + xxlor 39, 4, 4 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vadduwm 11, 11, 21 + vadduwm 14, 14, 27 + vadduwm 31, 31, 7 + vadduwm 13, 13, 28 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 0, 49, 49 + xxlor 49, 5, 5 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 11, 11, 17 + vadduwm 14, 14, 1 + vadduwm 31, 31, 2 + vadduwm 13, 13, 22 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 34, 3, 3 + xxlor 49, 2, 2 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + vadduwm 11, 11, 19 + vadduwm 14, 14, 20 + vadduwm 31, 31, 2 + vadduwm 13, 13, 17 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 14, 14, 27 + vadduwm 11, 11, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 27, 4, 24 + vadduwm 3, 15, 3 + xxlxor 57, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 59, 32 + xxlor 39, 7, 7 + vrlw 30, 30, 8 + vrlw 25, 25, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 1, 58, 58 + vmr 26, 19 + vadduwm 19, 31, 7 + xxlor 39, 6, 6 + vadduwm 11, 30, 11 + vadduwm 7, 13, 7 + vadduwm 13, 25, 14 + vadduwm 14, 29, 19 + vadduwm 7, 0, 7 + xxlxor 48, 43, 48 + xxlxor 36, 45, 36 + xxlxor 47, 46, 47 + xxlxor 50, 39, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 51, 1, 1 + vadduwm 13, 13, 1 + vadduwm 11, 11, 19 + vadduwm 19, 16, 27 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 63, 51, 62 + xxlxor 62, 35, 57 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 31, 31, 10 + vrlw 30, 30, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 33, 0, 0 + vadduwm 7, 7, 2 + vadduwm 14, 14, 1 + vadduwm 11, 31, 11 + vadduwm 13, 30, 13 + vadduwm 14, 29, 14 + vadduwm 7, 0, 7 + xxlxor 48, 43, 48 + xxlxor 36, 45, 36 + xxlxor 47, 46, 47 + xxlxor 50, 39, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 60, 8, 8 + vadduwm 1, 11, 21 + vadduwm 11, 13, 28 + vadduwm 13, 16, 19 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 51, 45, 63 + xxlxor 63, 35, 62 + xxlxor 62, 38, 61 + xxlxor 32, 44, 32 + vrlw 31, 31, 8 + vrlw 30, 30, 8 + vrlw 0, 0, 8 + vrlw 19, 19, 8 + vadduwm 14, 14, 26 + vadduwm 7, 7, 17 + vadduwm 1, 31, 1 + vadduwm 11, 30, 11 + vadduwm 14, 0, 14 + vadduwm 7, 19, 7 + xxlxor 50, 33, 50 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 39, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 34, 4, 4 + vadduwm 14, 14, 22 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 13, 4, 13 + vadduwm 3, 15, 3 + xxlxor 49, 38, 63 + xxlxor 63, 44, 62 + xxlxor 32, 45, 32 + xxlxor 51, 35, 51 + vrlw 17, 17, 10 + vrlw 31, 31, 10 + vrlw 0, 0, 10 + vrlw 10, 19, 10 + vadduwm 11, 11, 2 + xxlor 34, 5, 5 + vadduwm 1, 1, 20 + vadduwm 2, 7, 2 + vadduwm 7, 31, 11 + vadduwm 11, 0, 14 + vadduwm 2, 10, 2 + vadduwm 1, 17, 1 + xxlxor 36, 43, 36 + xxlxor 46, 34, 47 + vrlw 4, 4, 9 + vrlw 14, 14, 9 + xxlxor 47, 33, 50 + xxlxor 48, 39, 48 + vrlw 15, 15, 9 + vrlw 9, 16, 9 + vadduwm 13, 4, 13 + vadduwm 3, 14, 3 + xxlxor 32, 45, 32 + xxlxor 45, 45, 33 + xxlxor 33, 35, 42 + xxlxor 59, 35, 39 + vadduwm 3, 15, 6 + vadduwm 6, 9, 12 + xxlxor 39, 35, 49 + xxlxor 42, 38, 63 + vrlw 1, 1, 8 + vrlw 7, 7, 8 + vrlw 10, 10, 8 + vrlw 0, 0, 8 + xxlxor 40, 35, 43 + xxlxor 38, 38, 34 + xxlxor 61, 33, 41 + xxlxor 50, 39, 36 + xxlxor 62, 42, 46 + xxlxor 54, 32, 47 + bne 0, .LBB3_2 +.LBB3_5: + vmrglw 2, 27, 13 + li 3, 32 + li 4, 48 + vmrglw 4, 6, 8 + vmrglw 0, 18, 29 + vmrglw 1, 22, 30 + vmrghw 3, 27, 13 + vmrghw 5, 6, 8 + vmrghw 6, 18, 29 + vmrghw 7, 22, 30 + xxmrgld 40, 36, 34 + xxmrghd 34, 36, 34 + xxmrgld 41, 33, 32 + xxswapd 0, 40 + xxmrgld 36, 37, 35 + xxmrghd 35, 37, 35 + xxmrghd 37, 33, 32 + xxswapd 1, 41 + xxmrgld 32, 39, 38 + xxmrghd 33, 39, 38 + xxswapd 2, 34 + xxswapd 4, 36 + xxswapd 3, 37 + stxvd2x 0, 0, 5 + xxswapd 5, 32 + stxvd2x 1, 5, 11 + xxswapd 0, 35 + xxswapd 1, 33 + stxvd2x 2, 5, 3 + li 3, 64 + stxvd2x 3, 5, 4 + li 4, 80 + stxvd2x 4, 5, 3 + li 3, 96 + stxvd2x 5, 5, 4 + li 4, 112 + stxvd2x 0, 5, 3 + stxvd2x 1, 5, 4 + li 3, 224 + lxvd2x 63, 1, 3 + li 3, 208 + lfd 31, 392(1) + ld 30, 312(1) + ld 29, 304(1) + lxvd2x 62, 1, 3 + li 3, 192 + lfd 30, 384(1) + ld 28, 296(1) + ld 27, 288(1) + lxvd2x 61, 1, 3 + li 3, 176 + lfd 29, 376(1) + ld 26, 280(1) + ld 25, 272(1) + lxvd2x 60, 1, 3 + li 3, 160 + lfd 28, 368(1) + ld 24, 264(1) + ld 23, 256(1) + lxvd2x 59, 1, 3 + li 3, 144 + lfd 27, 360(1) + ld 22, 248(1) + lxvd2x 58, 1, 3 + li 3, 128 + lfd 26, 352(1) + lxvd2x 57, 1, 3 + li 3, 112 + lfd 25, 344(1) + lxvd2x 56, 1, 3 + li 3, 96 + lfd 24, 336(1) + lxvd2x 55, 1, 3 + li 3, 80 + lfd 23, 328(1) + lxvd2x 54, 1, 3 + li 3, 64 + lxvd2x 53, 1, 3 + li 3, 48 + lxvd2x 52, 1, 3 + addi 1, 1, 400 + blr + .long 0 + .quad 0 +.Lfunc_end3: + .size blake3_hash4_sse2, .Lfunc_end3-.Lfunc_begin3 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S new file mode 100644 index 000000000..a8b2627f1 --- /dev/null +++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S @@ -0,0 +1,3064 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + * + * This is converted assembly: SSE4.1 -> POWER8 PPC64 Little Endian + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + .text + .abiversion 2 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 31 + .byte 14 + .byte 13 + .byte 12 + .byte 30 + .byte 10 + .byte 9 + .byte 8 + .byte 29 + .byte 6 + .byte 5 + .byte 4 + .byte 28 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI0_2: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI0_3: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI0_4: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_5: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI0_6: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 +.LCPI0_7: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_8: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_9: + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_10: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 31 + .byte 31 + .byte 31 +.LCPI0_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_12: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_13: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_14: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .text + .globl zfs_blake3_compress_in_place_sse41 + .p2align 2 + .type zfs_blake3_compress_in_place_sse41,@function +zfs_blake3_compress_in_place_sse41: +.Lfunc_begin0: + .cfi_startproc +.Lfunc_gep0: + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 2, 2, .TOC.-.Lfunc_gep0@l +.Lfunc_lep0: + .localentry zfs_blake3_compress_in_place_sse41, .Lfunc_lep0-.Lfunc_gep0 + li 8, -64 + mtvsrd 34, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 13, -16 + stxvd2x 60, 1, 8 + li 8, -48 + mtvsrd 35, 7 + lfd 2, 16(4) + lfd 3, 24(4) + addis 7, 2, .LCPI0_0@toc@ha + stxvd2x 61, 1, 8 + li 8, -32 + mtvsrwz 36, 6 + rldicl 6, 6, 32, 32 + stxvd2x 62, 1, 8 + li 8, -16 + vmrghb 2, 3, 2 + stxvd2x 63, 1, 8 + mtvsrwz 35, 6 + addi 6, 7, .LCPI0_0@toc@l + addis 7, 2, .LCPI0_2@toc@ha + lfd 1, 8(4) + xxmrghd 32, 3, 2 + lvx 6, 0, 6 + xxlxor 33, 33, 33 + addis 6, 2, .LCPI0_1@toc@ha + addi 7, 7, .LCPI0_2@toc@l + vmrghw 3, 3, 4 + addi 6, 6, .LCPI0_1@toc@l + vspltisw 14, 9 + xxmrghd 37, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + vperm 2, 1, 2, 6 + vpkudum 9, 0, 5 + xxswapd 36, 0 + xxswapd 38, 1 + xxmrgld 34, 34, 35 + lvx 3, 0, 7 + addis 7, 2, .LCPI0_4@toc@ha + addi 7, 7, .LCPI0_4@toc@l + vadduwm 4, 9, 4 + lvx 11, 0, 7 + addis 7, 2, .LCPI0_6@toc@ha + addi 7, 7, .LCPI0_6@toc@l + vadduwm 7, 4, 6 + lvx 4, 0, 6 + addis 6, 2, .LCPI0_3@toc@ha + addi 6, 6, .LCPI0_3@toc@l + vperm 11, 0, 5, 11 + lvx 0, 0, 7 + li 7, 48 + xxlxor 40, 39, 34 + lvx 10, 0, 6 + addis 6, 2, .LCPI0_5@toc@ha + lxvd2x 1, 4, 7 + vcmpgtsb 2, 1, 4 + addi 6, 6, .LCPI0_5@toc@l + vperm 4, 8, 8, 3 + vspltisw 8, 10 + xxlandc 44, 36, 34 + vadduwm 4, 8, 8 + vadduwm 8, 12, 10 + xxlxor 37, 40, 38 + vrlw 6, 5, 4 + vadduwm 5, 7, 11 + vadduwm 7, 6, 5 + lvx 5, 0, 6 + li 6, 32 + lxvd2x 0, 4, 6 + addis 4, 2, .LCPI0_7@toc@ha + addis 6, 2, .LCPI0_9@toc@ha + xxlxor 42, 39, 44 + xxswapd 44, 1 + addi 4, 4, .LCPI0_7@toc@l + addi 6, 6, .LCPI0_9@toc@l + vcmpgtsb 5, 1, 5 + vperm 1, 10, 10, 0 + xxswapd 42, 0 + vpkudum 16, 12, 10 + xxlandc 47, 33, 37 + vsubuwm 1, 14, 13 + lvx 14, 0, 4 + addis 4, 2, .LCPI0_8@toc@ha + vadduwm 8, 15, 8 + xxswapd 45, 47 + addi 4, 4, .LCPI0_8@toc@l + vadduwm 7, 7, 16 + xxsldwi 48, 48, 48, 1 + xxlxor 38, 40, 38 + xxsldwi 40, 40, 40, 3 + xxsldwi 39, 39, 39, 1 + vperm 14, 10, 12, 14 + vrlw 6, 6, 1 + vadduwm 7, 6, 7 + xxlxor 45, 39, 45 + vperm 13, 13, 13, 3 + xxlandc 45, 45, 34 + vadduwm 8, 13, 8 + xxlxor 38, 40, 38 + vrlw 10, 6, 4 + vadduwm 6, 7, 14 + vadduwm 7, 10, 6 + xxlxor 38, 39, 45 + vperm 12, 6, 6, 0 + lvx 6, 0, 4 + addis 4, 2, .LCPI0_10@toc@ha + addi 4, 4, .LCPI0_10@toc@l + vperm 13, 11, 9, 6 + xxlandc 44, 44, 37 + vadduwm 15, 12, 8 + vadduwm 7, 7, 13 + xxsldwi 45, 45, 45, 3 + xxlxor 40, 47, 42 + xxsldwi 47, 47, 47, 1 + xxsldwi 39, 39, 39, 3 + vrlw 10, 8, 1 + xxswapd 40, 44 + vadduwm 17, 10, 7 + lvx 7, 0, 4 + addis 4, 2, .LCPI0_11@toc@ha + addi 4, 4, .LCPI0_11@toc@l + xxlxor 44, 49, 40 + lvx 8, 0, 6 + vperm 18, 9, 9, 7 + lvx 9, 0, 4 + addis 4, 2, .LCPI0_12@toc@ha + vperm 12, 12, 12, 3 + addi 4, 4, .LCPI0_12@toc@l + vperm 19, 14, 16, 8 + xxlandc 63, 44, 34 + vperm 12, 19, 18, 9 + vadduwm 15, 31, 15 + xxlxor 42, 47, 42 + vrlw 18, 10, 4 + vadduwm 10, 17, 12 + vadduwm 17, 18, 10 + xxlxor 42, 49, 63 + xxmrgld 63, 43, 46 + xxsldwi 49, 49, 49, 1 + vmrghw 14, 14, 11 + vperm 19, 10, 10, 0 + lvx 10, 0, 4 + addis 4, 2, .LCPI0_13@toc@ha + addi 4, 4, .LCPI0_13@toc@l + lvx 11, 0, 4 + addis 4, 2, .LCPI0_14@toc@ha + vperm 31, 16, 31, 10 + addi 4, 4, .LCPI0_14@toc@l + vperm 14, 14, 16, 11 + xxlandc 51, 51, 37 + vadduwm 15, 19, 15 + xxswapd 51, 51 + vadduwm 17, 17, 31 + xxlxor 50, 47, 50 + xxsldwi 47, 47, 47, 3 + vperm 30, 14, 31, 8 + vrlw 18, 18, 1 + vadduwm 17, 18, 17 + xxlxor 51, 49, 51 + vadduwm 17, 17, 14 + vperm 19, 19, 19, 3 + xxlandc 51, 51, 34 + vadduwm 15, 19, 15 + xxlxor 48, 47, 50 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 51 + vperm 19, 12, 13, 6 + vperm 18, 18, 18, 0 + vperm 13, 13, 13, 7 + vadduwm 17, 17, 19 + xxlandc 50, 50, 37 + xxsldwi 49, 49, 49, 3 + vperm 13, 30, 13, 9 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxmrgld 62, 44, 46 + vmrghw 12, 14, 12 + xxlxor 48, 47, 48 + xxsldwi 47, 47, 47, 1 + vrlw 16, 16, 1 + vperm 30, 31, 30, 10 + vperm 12, 12, 31, 11 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 13 + vperm 18, 18, 18, 3 + vperm 31, 12, 30, 8 + xxlandc 50, 50, 34 + vadduwm 15, 18, 15 + xxlxor 48, 47, 48 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + xxsldwi 49, 49, 49, 1 + vperm 18, 18, 18, 0 + vadduwm 17, 17, 30 + xxlandc 50, 50, 37 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxlxor 48, 47, 48 + xxsldwi 46, 47, 47, 3 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 12 + vperm 18, 18, 18, 3 + xxlandc 47, 50, 34 + xxsldwi 50, 51, 51, 3 + vadduwm 14, 15, 14 + vperm 19, 13, 18, 6 + xxlxor 48, 46, 48 + vperm 18, 18, 18, 7 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vadduwm 17, 17, 19 + vperm 15, 15, 15, 0 + xxsldwi 49, 49, 49, 3 + xxlandc 47, 47, 37 + vadduwm 14, 15, 14 + xxswapd 47, 47 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 29, 15, 14 + vperm 14, 31, 18, 9 + xxmrgld 50, 45, 44 + xxlxor 48, 61, 48 + vmrghw 12, 12, 13 + vrlw 16, 16, 4 + vperm 18, 30, 18, 10 + vadduwm 17, 17, 14 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + xxsldwi 49, 49, 49, 1 + vperm 15, 15, 15, 0 + vadduwm 17, 17, 18 + xxlandc 47, 47, 37 + vadduwm 31, 15, 29 + xxswapd 47, 47 + xxlxor 48, 63, 48 + xxsldwi 45, 63, 63, 3 + vperm 31, 12, 30, 11 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 13, 15, 13 + xxlxor 44, 45, 48 + vadduwm 16, 17, 31 + xxsldwi 49, 51, 51, 3 + vrlw 12, 12, 4 + vperm 19, 14, 17, 6 + vadduwm 16, 12, 16 + xxlxor 47, 48, 47 + vperm 15, 15, 15, 0 + xxlandc 47, 47, 37 + vadduwm 13, 15, 13 + xxswapd 47, 47 + xxlxor 44, 45, 44 + xxsldwi 45, 45, 45, 1 + vrlw 30, 12, 1 + vadduwm 12, 16, 19 + xxsldwi 44, 44, 44, 3 + vadduwm 16, 30, 12 + xxlxor 44, 48, 47 + vperm 15, 17, 17, 7 + vperm 12, 12, 12, 3 + vperm 17, 31, 18, 8 + xxlandc 61, 44, 34 + vperm 12, 17, 15, 9 + vadduwm 13, 29, 13 + xxlxor 47, 45, 62 + xxmrgld 62, 46, 63 + vmrghw 14, 31, 14 + vrlw 15, 15, 4 + vadduwm 16, 16, 12 + vperm 30, 18, 30, 10 + vperm 14, 14, 18, 11 + xxsldwi 50, 51, 51, 3 + vadduwm 16, 15, 16 + xxlxor 49, 48, 61 + xxsldwi 48, 48, 48, 1 + vperm 19, 12, 18, 6 + vperm 17, 17, 17, 0 + vadduwm 16, 16, 30 + xxmrgld 60, 44, 46 + vmrghw 12, 14, 12 + vperm 28, 30, 28, 10 + xxlandc 49, 49, 37 + vadduwm 13, 17, 13 + xxswapd 49, 49 + vperm 12, 12, 30, 11 + xxlxor 47, 45, 47 + xxsldwi 45, 45, 45, 3 + vrlw 15, 15, 1 + vperm 8, 12, 28, 8 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vadduwm 16, 16, 14 + vperm 17, 17, 17, 3 + xxlandc 49, 49, 34 + vadduwm 13, 17, 13 + xxlxor 47, 45, 47 + vrlw 15, 15, 4 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vperm 17, 17, 17, 0 + xxlandc 49, 49, 37 + vadduwm 31, 17, 13 + xxlxor 45, 63, 47 + vrlw 15, 13, 1 + vadduwm 13, 16, 19 + xxswapd 48, 49 + xxsldwi 51, 51, 51, 3 + xxsldwi 45, 45, 45, 3 + vadduwm 17, 15, 13 + xxlxor 45, 49, 48 + lvx 16, 0, 4 + vperm 29, 13, 13, 3 + vperm 13, 18, 18, 7 + xxsldwi 50, 63, 63, 1 + vperm 16, 14, 30, 16 + vperm 7, 19, 19, 7 + xxlandc 63, 61, 34 + vadduwm 18, 31, 18 + vperm 29, 16, 13, 9 + xxlxor 47, 50, 47 + vperm 6, 16, 19, 6 + vrlw 15, 15, 4 + vperm 7, 8, 7, 9 + vadduwm 17, 17, 29 + xxmrgld 41, 61, 44 + vadduwm 17, 15, 17 + vperm 9, 28, 9, 10 + xxlxor 63, 49, 63 + xxsldwi 49, 49, 49, 1 + vperm 31, 31, 31, 0 + vadduwm 17, 17, 28 + xxlandc 63, 63, 37 + vadduwm 18, 31, 18 + xxswapd 63, 63 + xxlxor 47, 50, 47 + xxsldwi 46, 50, 50, 3 + vrlw 15, 15, 1 + vadduwm 17, 15, 17 + xxlxor 63, 49, 63 + vadduwm 17, 17, 12 + vperm 31, 31, 31, 3 + xxlandc 50, 63, 34 + vadduwm 14, 18, 14 + xxlxor 47, 46, 47 + vrlw 15, 15, 4 + vadduwm 17, 15, 17 + xxlxor 50, 49, 50 + vadduwm 6, 17, 6 + vperm 18, 18, 18, 0 + xxsldwi 38, 38, 38, 3 + xxlandc 50, 50, 37 + vadduwm 14, 18, 14 + xxswapd 48, 50 + xxlxor 47, 46, 47 + xxsldwi 46, 46, 46, 1 + vrlw 15, 15, 1 + vadduwm 6, 15, 6 + xxlxor 48, 38, 48 + vadduwm 6, 6, 7 + vperm 16, 16, 16, 3 + xxlandc 48, 48, 34 + vadduwm 14, 16, 14 + xxlxor 40, 46, 47 + vrlw 8, 8, 4 + vadduwm 6, 8, 6 + xxlxor 39, 38, 48 + xxsldwi 38, 38, 38, 1 + vperm 7, 7, 7, 0 + vadduwm 6, 6, 9 + xxlandc 39, 39, 37 + vadduwm 14, 7, 14 + xxswapd 39, 39 + xxlxor 40, 46, 40 + xxsldwi 41, 46, 46, 3 + vrlw 8, 8, 1 + vadduwm 6, 8, 6 + xxlxor 39, 38, 39 + vperm 3, 7, 7, 3 + vmrghw 7, 12, 13 + xxlandc 34, 35, 34 + vperm 7, 7, 28, 11 + vadduwm 3, 2, 9 + xxlxor 40, 35, 40 + vrlw 4, 8, 4 + vadduwm 6, 6, 7 + vadduwm 6, 4, 6 + xxlxor 34, 38, 34 + xxsldwi 0, 38, 38, 3 + vperm 2, 2, 2, 0 + xxlandc 34, 34, 37 + vadduwm 3, 2, 3 + xxswapd 34, 34 + xxlxor 36, 35, 36 + xxsldwi 1, 35, 35, 1 + vrlw 4, 4, 1 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 1, 36, 34 + stxvd2x 0, 0, 3 + xxswapd 1, 1 + stxvd2x 1, 3, 5 + li 3, -16 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 31 + .byte 14 + .byte 13 + .byte 12 + .byte 30 + .byte 10 + .byte 9 + .byte 8 + .byte 29 + .byte 6 + .byte 5 + .byte 4 + .byte 28 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI1_2: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI1_3: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI1_4: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_5: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI1_6: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 +.LCPI1_7: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_8: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_9: + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_10: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 31 + .byte 31 + .byte 31 +.LCPI1_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_12: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_13: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_14: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .text + .globl zfs_blake3_compress_xof_sse41 + .p2align 2 + .type zfs_blake3_compress_xof_sse41,@function +zfs_blake3_compress_xof_sse41: +.Lfunc_begin1: + .cfi_startproc +.Lfunc_gep1: + addis 2, 12, .TOC.-.Lfunc_gep1@ha + addi 2, 2, .TOC.-.Lfunc_gep1@l +.Lfunc_lep1: + .localentry zfs_blake3_compress_xof_sse41, .Lfunc_lep1-.Lfunc_gep1 + li 9, -64 + mtvsrd 34, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 13, -16 + addis 11, 2, .LCPI1_9@toc@ha + stxvd2x 60, 1, 9 + li 9, -48 + mtvsrd 35, 7 + lfd 1, 8(4) + lfd 2, 16(4) + addis 7, 2, .LCPI1_0@toc@ha + stxvd2x 61, 1, 9 + li 9, -32 + mtvsrwz 36, 6 + rldicl 6, 6, 32, 32 + stxvd2x 62, 1, 9 + li 9, -16 + vmrghb 2, 3, 2 + stxvd2x 63, 1, 9 + mtvsrwz 35, 6 + addi 6, 7, .LCPI1_0@toc@l + addis 7, 2, .LCPI1_2@toc@ha + lfd 3, 24(4) + xxmrghd 37, 1, 0 + lvx 6, 0, 6 + xxlxor 33, 33, 33 + lxvd2x 0, 0, 3 + addis 6, 2, .LCPI1_1@toc@ha + addi 7, 7, .LCPI1_2@toc@l + vmrghw 3, 3, 4 + lxvd2x 1, 3, 5 + addi 6, 6, .LCPI1_1@toc@l + vspltisw 14, 9 + xxmrghd 32, 3, 2 + xxswapd 36, 0 + vperm 2, 1, 2, 6 + xxswapd 38, 1 + vpkudum 9, 0, 5 + xxmrgld 34, 34, 35 + lvx 3, 0, 7 + addis 7, 2, .LCPI1_4@toc@ha + addi 7, 7, .LCPI1_4@toc@l + vadduwm 4, 9, 4 + lvx 11, 0, 7 + addis 7, 2, .LCPI1_6@toc@ha + addi 7, 7, .LCPI1_6@toc@l + vadduwm 7, 4, 6 + lvx 4, 0, 6 + addis 6, 2, .LCPI1_3@toc@ha + addi 6, 6, .LCPI1_3@toc@l + vperm 11, 0, 5, 11 + lvx 0, 0, 7 + li 7, 32 + xxlxor 40, 39, 34 + lvx 10, 0, 6 + addis 6, 2, .LCPI1_5@toc@ha + lxvd2x 0, 4, 7 + vcmpgtsb 2, 1, 4 + addi 6, 6, .LCPI1_5@toc@l + vperm 4, 8, 8, 3 + vspltisw 8, 10 + xxlandc 44, 36, 34 + vadduwm 4, 8, 8 + vadduwm 8, 12, 10 + xxlxor 37, 40, 38 + vrlw 6, 5, 4 + vadduwm 5, 7, 11 + vadduwm 7, 6, 5 + lvx 5, 0, 6 + li 6, 48 + lxvd2x 1, 4, 6 + addis 4, 2, .LCPI1_7@toc@ha + xxlxor 42, 39, 44 + addi 4, 4, .LCPI1_7@toc@l + vcmpgtsb 5, 1, 5 + vperm 1, 10, 10, 0 + xxswapd 42, 0 + xxswapd 44, 1 + vpkudum 16, 12, 10 + xxlandc 47, 33, 37 + vsubuwm 1, 14, 13 + lvx 14, 0, 4 + addis 4, 2, .LCPI1_8@toc@ha + vadduwm 8, 15, 8 + xxswapd 45, 47 + addi 4, 4, .LCPI1_8@toc@l + xxlxor 38, 40, 38 + xxsldwi 40, 40, 40, 3 + vadduwm 7, 7, 16 + xxsldwi 48, 48, 48, 1 + vrlw 6, 6, 1 + xxsldwi 39, 39, 39, 1 + vperm 14, 10, 12, 14 + vadduwm 7, 6, 7 + xxlxor 45, 39, 45 + vperm 13, 13, 13, 3 + xxlandc 45, 45, 34 + vadduwm 8, 13, 8 + xxlxor 38, 40, 38 + vrlw 10, 6, 4 + vadduwm 6, 7, 14 + vadduwm 7, 10, 6 + xxlxor 38, 39, 45 + vperm 12, 6, 6, 0 + lvx 6, 0, 4 + addis 4, 2, .LCPI1_10@toc@ha + addi 4, 4, .LCPI1_10@toc@l + vperm 13, 11, 9, 6 + xxlandc 44, 44, 37 + vadduwm 15, 12, 8 + vadduwm 7, 7, 13 + xxsldwi 45, 45, 45, 3 + xxlxor 40, 47, 42 + xxsldwi 47, 47, 47, 1 + xxsldwi 39, 39, 39, 3 + vrlw 10, 8, 1 + xxswapd 40, 44 + vadduwm 17, 10, 7 + lvx 7, 0, 4 + addi 4, 11, .LCPI1_9@toc@l + xxlxor 44, 49, 40 + lvx 8, 0, 4 + addis 4, 2, .LCPI1_11@toc@ha + vperm 18, 9, 9, 7 + addi 4, 4, .LCPI1_11@toc@l + vperm 12, 12, 12, 3 + lvx 9, 0, 4 + addis 4, 2, .LCPI1_12@toc@ha + vperm 19, 14, 16, 8 + addi 4, 4, .LCPI1_12@toc@l + xxlandc 63, 44, 34 + vperm 12, 19, 18, 9 + vadduwm 15, 31, 15 + xxlxor 42, 47, 42 + vrlw 18, 10, 4 + vadduwm 10, 17, 12 + vadduwm 17, 18, 10 + xxlxor 42, 49, 63 + xxmrgld 63, 43, 46 + xxsldwi 49, 49, 49, 1 + vmrghw 14, 14, 11 + vperm 19, 10, 10, 0 + lvx 10, 0, 4 + addis 4, 2, .LCPI1_13@toc@ha + addi 4, 4, .LCPI1_13@toc@l + lvx 11, 0, 4 + addis 4, 2, .LCPI1_14@toc@ha + vperm 31, 16, 31, 10 + addi 4, 4, .LCPI1_14@toc@l + vperm 14, 14, 16, 11 + xxlandc 51, 51, 37 + vadduwm 15, 19, 15 + xxswapd 51, 51 + vadduwm 17, 17, 31 + xxlxor 50, 47, 50 + xxsldwi 47, 47, 47, 3 + vperm 30, 14, 31, 8 + vrlw 18, 18, 1 + vadduwm 17, 18, 17 + xxlxor 51, 49, 51 + vadduwm 17, 17, 14 + vperm 19, 19, 19, 3 + xxlandc 51, 51, 34 + vadduwm 15, 19, 15 + xxlxor 48, 47, 50 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 51 + vperm 19, 12, 13, 6 + vperm 18, 18, 18, 0 + vperm 13, 13, 13, 7 + vadduwm 17, 17, 19 + xxlandc 50, 50, 37 + xxsldwi 49, 49, 49, 3 + vperm 13, 30, 13, 9 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxmrgld 62, 44, 46 + vmrghw 12, 14, 12 + xxlxor 48, 47, 48 + xxsldwi 47, 47, 47, 1 + vrlw 16, 16, 1 + vperm 30, 31, 30, 10 + vperm 12, 12, 31, 11 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 13 + vperm 18, 18, 18, 3 + vperm 31, 12, 30, 8 + xxlandc 50, 50, 34 + vadduwm 15, 18, 15 + xxlxor 48, 47, 48 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + xxsldwi 49, 49, 49, 1 + vperm 18, 18, 18, 0 + vadduwm 17, 17, 30 + xxlandc 50, 50, 37 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxlxor 48, 47, 48 + xxsldwi 46, 47, 47, 3 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 12 + vperm 18, 18, 18, 3 + xxlandc 47, 50, 34 + xxsldwi 50, 51, 51, 3 + vadduwm 14, 15, 14 + vperm 19, 13, 18, 6 + xxlxor 48, 46, 48 + vperm 18, 18, 18, 7 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vadduwm 17, 17, 19 + vperm 15, 15, 15, 0 + xxsldwi 49, 49, 49, 3 + xxlandc 47, 47, 37 + vadduwm 14, 15, 14 + xxswapd 47, 47 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 29, 15, 14 + vperm 14, 31, 18, 9 + xxmrgld 50, 45, 44 + xxlxor 48, 61, 48 + vmrghw 12, 12, 13 + vrlw 16, 16, 4 + vperm 18, 30, 18, 10 + vadduwm 17, 17, 14 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + xxsldwi 49, 49, 49, 1 + vperm 15, 15, 15, 0 + vadduwm 17, 17, 18 + xxlandc 47, 47, 37 + vadduwm 31, 15, 29 + xxswapd 47, 47 + xxlxor 48, 63, 48 + xxsldwi 45, 63, 63, 3 + vperm 31, 12, 30, 11 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 13, 15, 13 + xxlxor 44, 45, 48 + vadduwm 16, 17, 31 + xxsldwi 49, 51, 51, 3 + vrlw 12, 12, 4 + vperm 19, 14, 17, 6 + vadduwm 16, 12, 16 + xxlxor 47, 48, 47 + vperm 15, 15, 15, 0 + xxlandc 47, 47, 37 + vadduwm 13, 15, 13 + xxswapd 47, 47 + xxlxor 44, 45, 44 + xxsldwi 45, 45, 45, 1 + vrlw 30, 12, 1 + vadduwm 12, 16, 19 + xxsldwi 44, 44, 44, 3 + vadduwm 16, 30, 12 + xxlxor 44, 48, 47 + vperm 15, 17, 17, 7 + vperm 12, 12, 12, 3 + vperm 17, 31, 18, 8 + xxlandc 61, 44, 34 + vperm 12, 17, 15, 9 + vadduwm 13, 29, 13 + xxlxor 47, 45, 62 + xxmrgld 62, 46, 63 + vmrghw 14, 31, 14 + vrlw 15, 15, 4 + vadduwm 16, 16, 12 + vperm 30, 18, 30, 10 + vperm 14, 14, 18, 11 + xxsldwi 50, 51, 51, 3 + vadduwm 16, 15, 16 + xxlxor 49, 48, 61 + xxsldwi 48, 48, 48, 1 + vperm 19, 12, 18, 6 + vperm 17, 17, 17, 0 + vadduwm 16, 16, 30 + xxmrgld 60, 44, 46 + vmrghw 12, 14, 12 + vperm 28, 30, 28, 10 + xxlandc 49, 49, 37 + vadduwm 13, 17, 13 + xxswapd 49, 49 + vperm 12, 12, 30, 11 + xxlxor 47, 45, 47 + xxsldwi 45, 45, 45, 3 + vrlw 15, 15, 1 + vperm 8, 12, 28, 8 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vadduwm 16, 16, 14 + vperm 17, 17, 17, 3 + xxlandc 49, 49, 34 + vadduwm 13, 17, 13 + xxlxor 47, 45, 47 + vrlw 15, 15, 4 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vperm 17, 17, 17, 0 + xxlandc 49, 49, 37 + vadduwm 31, 17, 13 + xxlxor 45, 63, 47 + vrlw 15, 13, 1 + vadduwm 13, 16, 19 + xxswapd 48, 49 + xxsldwi 51, 51, 51, 3 + xxsldwi 45, 45, 45, 3 + vadduwm 17, 15, 13 + xxlxor 45, 49, 48 + lvx 16, 0, 4 + vperm 29, 13, 13, 3 + vperm 13, 18, 18, 7 + xxsldwi 50, 63, 63, 1 + vperm 16, 14, 30, 16 + vperm 7, 19, 19, 7 + xxlandc 63, 61, 34 + vadduwm 18, 31, 18 + vperm 29, 16, 13, 9 + xxlxor 47, 50, 47 + vperm 6, 16, 19, 6 + vrlw 15, 15, 4 + vperm 7, 8, 7, 9 + vadduwm 17, 17, 29 + xxmrgld 41, 61, 44 + vadduwm 17, 15, 17 + vperm 9, 28, 9, 10 + xxlxor 63, 49, 63 + xxsldwi 49, 49, 49, 1 + vperm 31, 31, 31, 0 + vadduwm 17, 17, 28 + xxlandc 63, 63, 37 + vadduwm 18, 31, 18 + xxswapd 63, 63 + xxlxor 47, 50, 47 + xxsldwi 46, 50, 50, 3 + vrlw 15, 15, 1 + vadduwm 17, 15, 17 + xxlxor 63, 49, 63 + vadduwm 17, 17, 12 + vperm 31, 31, 31, 3 + xxlandc 50, 63, 34 + vadduwm 14, 18, 14 + xxlxor 47, 46, 47 + vrlw 15, 15, 4 + vadduwm 17, 15, 17 + xxlxor 50, 49, 50 + vadduwm 6, 17, 6 + vperm 18, 18, 18, 0 + xxsldwi 38, 38, 38, 3 + xxlandc 50, 50, 37 + vadduwm 14, 18, 14 + xxswapd 48, 50 + xxlxor 47, 46, 47 + xxsldwi 46, 46, 46, 1 + vrlw 15, 15, 1 + vadduwm 6, 15, 6 + xxlxor 48, 38, 48 + vadduwm 6, 6, 7 + vperm 16, 16, 16, 3 + xxlandc 48, 48, 34 + vadduwm 14, 16, 14 + xxlxor 40, 46, 47 + vrlw 8, 8, 4 + vadduwm 6, 8, 6 + xxlxor 39, 38, 48 + xxsldwi 38, 38, 38, 1 + vperm 7, 7, 7, 0 + vadduwm 6, 6, 9 + xxlandc 39, 39, 37 + vadduwm 14, 7, 14 + xxswapd 39, 39 + xxlxor 40, 46, 40 + xxsldwi 41, 46, 46, 3 + vrlw 8, 8, 1 + vadduwm 6, 8, 6 + xxlxor 39, 38, 39 + vperm 3, 7, 7, 3 + vmrghw 7, 12, 13 + xxlandc 34, 35, 34 + vperm 7, 7, 28, 11 + vadduwm 3, 2, 9 + xxlxor 40, 35, 40 + vrlw 4, 8, 4 + vadduwm 6, 6, 7 + vadduwm 6, 4, 6 + xxlxor 34, 38, 34 + xxsldwi 0, 38, 38, 3 + vperm 2, 2, 2, 0 + xxlandc 34, 34, 37 + vadduwm 3, 2, 3 + xxswapd 34, 34 + xxlxor 36, 35, 36 + xxsldwi 1, 35, 35, 1 + vrlw 4, 4, 1 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 2, 36, 34 + stxvd2x 0, 0, 8 + xxswapd 2, 2 + stxvd2x 2, 8, 5 + lfdx 0, 0, 3 + lfd 2, 8(3) + xxmrghd 35, 2, 0 + xxlxor 0, 1, 35 + xxswapd 0, 0 + stxvd2x 0, 8, 7 + lfd 0, 16(3) + lfd 1, 24(3) + li 3, -16 + xxmrghd 35, 1, 0 + xxlxor 0, 34, 35 + xxswapd 0, 0 + stxvd2x 0, 8, 6 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-.Lfunc_begin1 + .cfi_endproc + + .globl zfs_blake3_hash_many_sse41 + .p2align 2 + .type zfs_blake3_hash_many_sse41,@function +zfs_blake3_hash_many_sse41: +.Lfunc_begin2: + .cfi_startproc +.Lfunc_gep2: + addis 2, 12, .TOC.-.Lfunc_gep2@ha + addi 2, 2, .TOC.-.Lfunc_gep2@l +.Lfunc_lep2: + .localentry zfs_blake3_hash_many_sse41, .Lfunc_lep2-.Lfunc_gep2 + mfocrf 12, 32 + mflr 0 + std 0, 16(1) + stw 12, 8(1) + stdu 1, -256(1) + .cfi_def_cfa_offset 256 + .cfi_offset lr, 16 + .cfi_offset r17, -120 + .cfi_offset r18, -112 + .cfi_offset r19, -104 + .cfi_offset r20, -96 + .cfi_offset r21, -88 + .cfi_offset r22, -80 + .cfi_offset r23, -72 + .cfi_offset r24, -64 + .cfi_offset r25, -56 + .cfi_offset r26, -48 + .cfi_offset r27, -40 + .cfi_offset r28, -32 + .cfi_offset r29, -24 + .cfi_offset r30, -16 + .cfi_offset cr2, 8 + std 26, 208(1) + mr 26, 4 + cmpldi 1, 4, 4 + andi. 4, 8, 1 + std 18, 144(1) + std 19, 152(1) + crmove 8, 1 + ld 19, 360(1) + lwz 18, 352(1) + std 24, 192(1) + std 25, 200(1) + std 27, 216(1) + std 28, 224(1) + mr 24, 10 + mr 28, 6 + mr 27, 5 + mr 25, 3 + std 29, 232(1) + std 30, 240(1) + mr 30, 9 + mr 29, 7 + std 17, 136(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + blt 1, .LBB2_3 + li 3, 0 + li 4, 1 + clrldi 23, 30, 32 + isel 22, 4, 3, 8 + clrldi 21, 24, 32 + clrldi 20, 18, 32 +.LBB2_2: + mr 3, 25 + mr 4, 27 + mr 5, 28 + mr 6, 29 + mr 7, 22 + mr 8, 23 + mr 9, 21 + mr 10, 20 + std 19, 32(1) + bl blake3_hash4_sse41 + addi 26, 26, -4 + addi 3, 29, 4 + addi 25, 25, 32 + addi 19, 19, 128 + cmpldi 26, 3 + isel 29, 3, 29, 8 + bgt 0, .LBB2_2 +.LBB2_3: + cmpldi 26, 0 + beq 0, .LBB2_11 + li 3, 0 + li 4, 1 + or 21, 24, 30 + li 20, 16 + addi 24, 1, 96 + isel 22, 4, 3, 8 +.LBB2_5: + lxvd2x 0, 28, 20 + ld 23, 0(25) + mr 17, 27 + mr 3, 21 + stxvd2x 0, 24, 20 + lxvd2x 0, 0, 28 + stxvd2x 0, 0, 24 +.LBB2_6: + cmpldi 17, 1 + beq 0, .LBB2_8 + cmpldi 17, 0 + bne 0, .LBB2_9 + b .LBB2_10 +.LBB2_8: + or 3, 3, 18 +.LBB2_9: + clrldi 7, 3, 56 + mr 3, 24 + mr 4, 23 + li 5, 64 + mr 6, 29 + bl zfs_blake3_compress_in_place_sse41 + addi 23, 23, 64 + addi 17, 17, -1 + mr 3, 30 + b .LBB2_6 +.LBB2_10: + lxvd2x 0, 24, 20 + addi 26, 26, -1 + add 29, 29, 22 + addi 25, 25, 8 + cmpldi 26, 0 + stxvd2x 0, 19, 20 + lxvd2x 0, 0, 24 + stxvd2x 0, 0, 19 + addi 19, 19, 32 + bne 0, .LBB2_5 +.LBB2_11: + ld 30, 240(1) + ld 29, 232(1) + ld 28, 224(1) + ld 27, 216(1) + ld 26, 208(1) + ld 25, 200(1) + ld 24, 192(1) + ld 23, 184(1) + ld 22, 176(1) + ld 21, 168(1) + ld 20, 160(1) + ld 19, 152(1) + ld 18, 144(1) + ld 17, 136(1) + addi 1, 1, 256 + ld 0, 16(1) + lwz 12, 8(1) + mtocrf 32, 12 + mtlr 0 + blr + .long 0 + .quad 0 +.Lfunc_end2: + .size zfs_blake3_hash_many_sse41, .Lfunc_end2-.Lfunc_begin2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI3_0: + .quad 4294967296 + .quad 12884901890 +.LCPI3_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI3_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI3_3: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI3_4: + .long 1779033703 + .long 1779033703 + .long 1779033703 + .long 1779033703 +.LCPI3_5: + .long 3144134277 + .long 3144134277 + .long 3144134277 + .long 3144134277 +.LCPI3_6: + .long 1013904242 + .long 1013904242 + .long 1013904242 + .long 1013904242 +.LCPI3_7: + .long 2773480762 + .long 2773480762 + .long 2773480762 + .long 2773480762 +.LCPI3_8: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .text + .p2align 2 + .type blake3_hash4_sse41,@function +blake3_hash4_sse41: +.Lfunc_begin3: + .cfi_startproc +.Lfunc_gep3: + addis 2, 12, .TOC.-.Lfunc_gep3@ha + addi 2, 2, .TOC.-.Lfunc_gep3@l +.Lfunc_lep3: + .localentry blake3_hash4_sse41, .Lfunc_lep3-.Lfunc_gep3 + stdu 1, -416(1) + .cfi_def_cfa_offset 416 + .cfi_offset r22, -176 + .cfi_offset r23, -168 + .cfi_offset r24, -160 + .cfi_offset r25, -152 + .cfi_offset r26, -144 + .cfi_offset r27, -136 + .cfi_offset r28, -128 + .cfi_offset r29, -120 + .cfi_offset r30, -112 + .cfi_offset f20, -96 + .cfi_offset f21, -88 + .cfi_offset f22, -80 + .cfi_offset f23, -72 + .cfi_offset f24, -64 + .cfi_offset f25, -56 + .cfi_offset f26, -48 + .cfi_offset f27, -40 + .cfi_offset f28, -32 + .cfi_offset f29, -24 + .cfi_offset f30, -16 + .cfi_offset f31, -8 + .cfi_offset v20, -368 + .cfi_offset v21, -352 + .cfi_offset v22, -336 + .cfi_offset v23, -320 + .cfi_offset v24, -304 + .cfi_offset v25, -288 + .cfi_offset v26, -272 + .cfi_offset v27, -256 + .cfi_offset v28, -240 + .cfi_offset v29, -224 + .cfi_offset v30, -208 + .cfi_offset v31, -192 + li 11, 48 + li 0, 8 + std 30, 304(1) + li 30, 12 + li 12, 4 + lfiwzx 0, 0, 5 + stxvd2x 52, 1, 11 + li 11, 64 + lfiwzx 2, 5, 0 + li 0, 20 + lfiwzx 3, 5, 30 + stxvd2x 53, 1, 11 + li 11, 80 + li 30, 24 + lfiwzx 4, 5, 0 + li 0, 28 + stxvd2x 54, 1, 11 + li 11, 96 + lfiwzx 1, 5, 12 + lfiwzx 6, 5, 30 + xxspltw 47, 0, 1 + cmpldi 4, 0 + std 22, 240(1) + stxvd2x 55, 1, 11 + li 11, 112 + lfiwzx 7, 5, 0 + xxspltw 40, 2, 1 + std 23, 248(1) + xxspltw 39, 3, 1 + std 24, 256(1) + std 25, 264(1) + xxspltw 51, 1, 1 + xxspltw 43, 6, 1 + std 26, 272(1) + xxspltw 41, 7, 1 + std 27, 280(1) + std 28, 288(1) + std 29, 296(1) + stxvd2x 56, 1, 11 + li 11, 128 + stfd 20, 320(1) + stxvd2x 57, 1, 11 + li 11, 144 + stfd 21, 328(1) + stxvd2x 58, 1, 11 + li 11, 160 + stfd 22, 336(1) + stxvd2x 59, 1, 11 + li 11, 176 + stfd 23, 344(1) + stxvd2x 60, 1, 11 + li 11, 192 + stfd 24, 352(1) + stxvd2x 61, 1, 11 + li 11, 208 + stfd 25, 360(1) + stxvd2x 62, 1, 11 + li 11, 224 + stfd 26, 368(1) + stxvd2x 63, 1, 11 + li 11, 16 + xxspltw 63, 4, 1 + lfiwzx 5, 5, 11 + ld 5, 448(1) + stfd 27, 376(1) + stfd 28, 384(1) + stfd 29, 392(1) + stfd 30, 400(1) + stfd 31, 408(1) + xxspltw 50, 5, 1 + beq 0, .LBB3_5 + addis 30, 2, .LCPI3_0@toc@ha + neg 7, 7 + xxleqv 34, 34, 34 + addis 28, 2, .LCPI3_5@toc@ha + addis 27, 2, .LCPI3_6@toc@ha + addis 26, 2, .LCPI3_7@toc@ha + addis 29, 2, .LCPI3_4@toc@ha + addis 25, 2, .LCPI3_8@toc@ha + addi 0, 30, .LCPI3_0@toc@l + mtfprwz 2, 7 + addis 7, 2, .LCPI3_1@toc@ha + addis 30, 2, .LCPI3_3@toc@ha + addi 24, 29, .LCPI3_4@toc@l + ld 29, 24(3) + lxvd2x 1, 0, 0 + mtfprwz 0, 6 + rldicl 6, 6, 32, 32 + addi 0, 30, .LCPI3_3@toc@l + ld 30, 16(3) + xxspltw 2, 2, 1 + vslw 2, 2, 2 + xxspltw 37, 0, 1 + mtfprwz 0, 6 + addi 6, 7, .LCPI3_1@toc@l + addis 7, 2, .LCPI3_2@toc@ha + xxswapd 35, 1 + xxlxor 36, 36, 36 + xxspltw 33, 0, 1 + xxland 35, 2, 35 + vadduwm 0, 3, 5 + lvx 5, 0, 6 + addi 6, 7, .LCPI3_2@toc@l + ld 7, 8(3) + xxlor 35, 35, 34 + xxlxor 34, 32, 34 + xxlor 9, 32, 32 + lvx 0, 0, 6 + ld 6, 0(3) + addi 3, 3, -8 + vcmpgtsw 2, 3, 2 + lvx 3, 0, 0 + addi 0, 28, .LCPI3_5@toc@l + addi 28, 27, .LCPI3_6@toc@l + addi 27, 26, .LCPI3_7@toc@l + addi 26, 25, .LCPI3_8@toc@l + or 25, 9, 8 + li 9, 0 + vcmpgtsb 5, 4, 5 + vcmpgtsb 0, 4, 0 + xxlor 11, 35, 35 + lvx 3, 0, 24 + xxlor 12, 35, 35 + vsubuwm 2, 1, 2 + xxlnor 10, 37, 37 + xxlor 13, 34, 34 + lvx 2, 0, 0 + li 0, 32 + xxlnor 31, 32, 32 + xxlor 30, 34, 34 + lvx 2, 0, 28 + li 28, 48 + xxlor 29, 34, 34 + lvx 2, 0, 27 + li 27, 0 + xxlor 28, 34, 34 + lvx 2, 0, 26 + xxlor 27, 34, 34 +.LBB3_2: + mr 26, 27 + addi 27, 27, 1 + xxlor 23, 39, 39 + cmpld 27, 4 + sldi 26, 26, 6 + xxlor 24, 40, 40 + iseleq 24, 10, 9 + add 23, 6, 26 + add 22, 30, 26 + lxvd2x 0, 6, 26 + lxvd2x 1, 7, 26 + or 25, 24, 25 + add 24, 7, 26 + lxvd2x 2, 30, 26 + lxvd2x 3, 29, 26 + xxlor 26, 47, 47 + lxvd2x 4, 23, 11 + lxvd2x 6, 24, 11 + clrlwi 25, 25, 24 + xxlor 25, 51, 51 + lxvd2x 7, 22, 11 + lxvd2x 8, 23, 0 + mtfprd 5, 25 + add 25, 29, 26 + xxswapd 34, 0 + lxvd2x 0, 25, 11 + xxswapd 38, 1 + xxswapd 32, 2 + lxvd2x 1, 24, 0 + lxvd2x 2, 22, 0 + xxswapd 40, 3 + xxswapd 39, 4 + lxvd2x 3, 25, 0 + lxvd2x 4, 23, 28 + xxswapd 60, 6 + xxswapd 47, 7 + lxvd2x 6, 24, 28 + xxswapd 57, 8 + lxvd2x 7, 22, 28 + lxvd2x 8, 25, 28 + xxswapd 58, 0 + mr 25, 3 + xxswapd 53, 1 + xxswapd 56, 2 + xxswapd 52, 3 + xxswapd 55, 4 + xxswapd 54, 6 + xxswapd 0, 5 + xxswapd 42, 7 + xxswapd 48, 8 + mtctr 12 +.LBB3_3: + ldu 24, 8(25) + add 24, 24, 26 + addi 24, 24, 256 + dcbt 0, 24 + bdnz .LBB3_3 + vmrgew 4, 28, 7 + vspltisw 14, 9 + mr 25, 8 + vmrgew 27, 6, 2 + vspltisw 17, 4 + vmrglw 12, 6, 2 + vspltisw 19, 10 + vmrghw 30, 6, 2 + xxspltw 0, 0, 3 + vmrglw 2, 8, 0 + vmrghw 13, 8, 0 + xxlor 7, 36, 36 + vmrgew 4, 21, 25 + vmrglw 29, 28, 7 + vmrghw 1, 28, 7 + vmrglw 28, 26, 15 + xxmrgld 37, 34, 44 + vmrgew 7, 26, 15 + vmrghw 15, 26, 15 + xxlor 21, 36, 36 + vmrglw 4, 21, 25 + vmrghw 21, 21, 25 + vmrglw 25, 20, 24 + xxmrgld 34, 60, 61 + vmrghw 26, 20, 24 + xxlor 38, 26, 26 + vmrgew 3, 8, 0 + xxlor 5, 36, 36 + vmrgew 4, 20, 24 + vspltisw 24, -16 + vmrglw 20, 22, 23 + xxmrgld 57, 57, 5 + vmrglw 8, 16, 10 + vmrghw 0, 16, 10 + vadduwm 12, 19, 19 + xxlor 8, 37, 37 + xxlor 20, 36, 36 + vmrgew 4, 22, 23 + vmrghw 23, 22, 23 + xxmrgld 40, 40, 52 + vmrgew 22, 16, 10 + vsubuwm 10, 14, 24 + vslw 14, 17, 17 + vadduwm 17, 5, 6 + xxmrgld 37, 47, 33 + xxlor 22, 36, 36 + xxmrgld 36, 45, 62 + xxlor 38, 25, 25 + xxlor 2, 34, 34 + vadduwm 19, 4, 6 + xxmrgld 38, 39, 7 + xxlor 3, 36, 36 + xxmrghd 39, 47, 33 + xxlor 36, 24, 24 + xxmrgld 33, 58, 53 + vadduwm 17, 17, 18 + vadduwm 29, 2, 4 + xxmrgld 36, 35, 59 + xxlor 34, 23, 23 + xxmrghd 35, 45, 62 + xxlor 1, 9, 9 + vadduwm 28, 5, 2 + xxlor 1, 13, 13 + vadduwm 19, 19, 31 + vadduwm 24, 29, 11 + vadduwm 28, 28, 9 + xxlxor 61, 49, 9 + xxlor 1, 41, 41 + xxlor 41, 11, 11 + xxlxor 34, 51, 13 + vperm 29, 29, 29, 9 + xxlxor 46, 56, 46 + vperm 2, 2, 2, 9 + xxlxor 59, 60, 0 + vperm 14, 14, 14, 9 + vperm 30, 27, 27, 9 + vadduwm 19, 19, 3 + xxlor 4, 35, 35 + xxland 61, 61, 10 + xxlor 35, 12, 12 + xxland 34, 34, 10 + vadduwm 27, 29, 3 + xxlor 35, 30, 30 + vadduwm 17, 17, 4 + xxlor 26, 36, 36 + xxland 46, 46, 10 + vadduwm 3, 2, 3 + xxlor 36, 29, 29 + xxland 62, 62, 10 + xxlxor 45, 59, 50 + xxlxor 50, 35, 63 + vadduwm 31, 14, 4 + xxlor 36, 28, 28 + xxlor 6, 37, 37 + vadduwm 16, 30, 4 + xxlxor 43, 63, 43 + xxlxor 37, 48, 1 + vrlw 4, 13, 12 + vrlw 18, 18, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 15, 24, 6 + vadduwm 28, 28, 7 + vadduwm 17, 4, 17 + vadduwm 19, 18, 19 + vadduwm 15, 11, 15 + vadduwm 28, 5, 28 + xxlor 25, 38, 38 + xxlxor 61, 49, 61 + xxlxor 34, 51, 34 + xxlxor 46, 47, 46 + xxlxor 62, 60, 62 + xxlor 38, 27, 27 + vadduwm 19, 19, 1 + vperm 29, 29, 29, 6 + vperm 2, 2, 2, 6 + vperm 24, 14, 14, 6 + vperm 30, 30, 30, 6 + xxlor 5, 33, 33 + vadduwm 17, 17, 25 + xxland 61, 61, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + xxland 62, 62, 31 + vadduwm 27, 29, 27 + vadduwm 3, 2, 3 + vadduwm 31, 24, 31 + vadduwm 16, 30, 16 + xxlxor 36, 59, 36 + xxlxor 50, 35, 50 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 18, 10 + xxmrgld 50, 32, 55 + vrlw 11, 11, 10 + xxmrghd 55, 32, 55 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 15, 15, 8 + vadduwm 28, 28, 18 + vadduwm 17, 1, 17 + vadduwm 19, 11, 19 + vadduwm 15, 5, 15 + vadduwm 28, 4, 28 + xxlor 7, 57, 57 + xxlxor 62, 49, 62 + xxlxor 61, 51, 61 + xxlxor 57, 47, 34 + xxlxor 34, 60, 56 + vperm 24, 30, 30, 9 + xxmrgld 62, 20, 21 + vperm 29, 29, 29, 9 + vperm 25, 25, 25, 9 + vperm 2, 2, 2, 9 + vmr 14, 8 + xxmrghd 40, 58, 53 + xxmrgld 58, 54, 22 + vadduwm 17, 17, 30 + xxland 56, 56, 10 + vadduwm 21, 19, 8 + xxland 61, 61, 10 + xxland 51, 57, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vadduwm 0, 15, 26 + vadduwm 15, 28, 23 + vadduwm 17, 1, 17 + vadduwm 28, 11, 21 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vmr 13, 8 + xxlor 53, 3, 3 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 52, 4, 4 + xxlor 40, 2, 2 + vadduwm 17, 17, 21 + vadduwm 28, 28, 20 + vadduwm 0, 0, 7 + vadduwm 15, 15, 8 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + vmr 25, 26 + xxlor 3, 39, 39 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 54, 6, 6 + xxlor 58, 5, 5 + xxlor 39, 8, 8 + vadduwm 17, 17, 22 + vadduwm 28, 28, 26 + vadduwm 0, 0, 7 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 39, 26, 26 + vadduwm 28, 28, 14 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 7 + vadduwm 0, 0, 30 + vadduwm 15, 15, 23 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 24, 55, 55 + vadduwm 17, 17, 13 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vmr 23, 13 + xxlor 45, 25, 25 + xxlor 39, 7, 7 + vadduwm 28, 28, 13 + vadduwm 0, 0, 18 + vadduwm 15, 15, 7 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 2, 46, 46 + xxlor 46, 3, 3 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vadduwm 17, 17, 20 + vadduwm 28, 28, 26 + vadduwm 0, 0, 25 + vadduwm 15, 15, 14 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 52, 2, 2 + vadduwm 17, 17, 8 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 28, 28, 20 + vadduwm 0, 0, 21 + vadduwm 15, 15, 18 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + vadduwm 17, 17, 22 + vadduwm 28, 28, 30 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 0, 0, 23 + vadduwm 15, 15, 7 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 5, 4, 4 + xxlor 4, 58, 58 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 39, 8, 8 + xxlor 54, 24, 24 + xxlor 58, 26, 26 + vadduwm 17, 17, 13 + vadduwm 28, 28, 7 + vadduwm 0, 0, 22 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 3, 53, 53 + xxlor 53, 4, 4 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vadduwm 17, 17, 21 + vadduwm 28, 28, 20 + vadduwm 0, 0, 18 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 2, 55, 55 + vmr 23, 18 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 50, 5, 5 + vadduwm 17, 17, 14 + vadduwm 28, 28, 30 + vadduwm 0, 0, 18 + vadduwm 15, 15, 22 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 25, 40, 40 + vmr 8, 13 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + xxlor 45, 25, 25 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 13 + xxlor 45, 2, 2 + vadduwm 0, 0, 8 + vadduwm 28, 28, 13 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 4, 57, 57 + xxlor 26, 46, 46 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 8, 62, 62 + xxlor 57, 3, 3 + xxlor 46, 7, 7 + xxlor 62, 6, 6 + vadduwm 17, 17, 7 + vadduwm 28, 28, 25 + vadduwm 0, 0, 14 + vadduwm 15, 15, 30 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vadduwm 17, 17, 20 + xxlor 3, 52, 52 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 52, 8, 8 + vadduwm 0, 0, 22 + vadduwm 28, 28, 20 + vadduwm 15, 15, 23 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 6, 55, 55 + xxlor 55, 4, 4 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 17, 17, 23 + vadduwm 28, 28, 13 + vadduwm 0, 0, 21 + vadduwm 15, 15, 14 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 4, 53, 53 + xxlor 53, 26, 26 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 21 + vadduwm 28, 28, 8 + vadduwm 0, 0, 7 + vadduwm 15, 15, 30 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 5, 25, 25 + xxlor 2, 58, 58 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vmr 22, 26 + vadduwm 0, 0, 26 + xxlor 58, 5, 5 + vadduwm 17, 17, 25 + vadduwm 28, 28, 18 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 7, 24, 24 + xxlor 8, 57, 57 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 57, 7, 7 + vadduwm 17, 17, 20 + vadduwm 28, 28, 13 + vadduwm 0, 0, 14 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 5, 52, 52 + xxlor 23, 45, 45 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 52, 6, 6 + vadduwm 28, 28, 8 + vmr 13, 8 + xxlor 40, 3, 3 + vadduwm 17, 17, 20 + vadduwm 0, 0, 8 + vadduwm 15, 15, 22 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 25, 39, 39 + vmr 7, 30 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vmr 30, 18 + xxlor 24, 46, 46 + xxlor 46, 25, 25 + xxlor 50, 8, 8 + vadduwm 17, 17, 23 + vadduwm 28, 28, 14 + vadduwm 0, 0, 18 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 6, 58, 58 + xxlor 58, 4, 4 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vadduwm 17, 17, 30 + vadduwm 28, 28, 26 + vadduwm 0, 0, 7 + vadduwm 15, 15, 21 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 40, 23, 23 + vadduwm 13, 28, 13 + vadduwm 8, 17, 8 + xxland 49, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 17, 31 + vadduwm 16, 29, 16 + vadduwm 28, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 60, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 2, 55, 55 + vmr 23, 30 + xxlor 62, 24, 24 + vadduwm 0, 0, 22 + vadduwm 15, 15, 30 + vadduwm 8, 4, 8 + vadduwm 13, 1, 13 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 40, 61 + xxlxor 51, 45, 51 + xxlxor 34, 32, 34 + xxlxor 49, 47, 49 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 17, 17, 17, 9 + vadduwm 13, 13, 14 + xxlor 46, 5, 5 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 49, 49, 10 + vadduwm 28, 29, 28 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 17, 16 + xxlxor 36, 60, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 8, 8, 25 + vadduwm 0, 0, 14 + vadduwm 15, 15, 7 + vadduwm 8, 4, 8 + vadduwm 13, 1, 13 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 62, 40, 61 + xxlxor 51, 45, 51 + xxlxor 34, 32, 34 + xxlxor 49, 47, 49 + vperm 30, 30, 30, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 17, 17, 17, 6 + vadduwm 29, 8, 20 + vadduwm 8, 13, 18 + xxland 45, 62, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 49, 49, 31 + vadduwm 30, 13, 28 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 17, 16 + xxlxor 36, 62, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 0, 0, 23 + vadduwm 7, 15, 21 + vadduwm 29, 1, 29 + vadduwm 8, 11, 8 + vadduwm 0, 5, 0 + vadduwm 7, 4, 7 + xxlxor 47, 61, 49 + xxlxor 45, 40, 45 + xxlxor 49, 32, 51 + xxlxor 34, 39, 34 + vperm 15, 15, 15, 9 + vperm 13, 13, 13, 9 + vperm 17, 17, 17, 9 + vperm 2, 2, 2, 9 + xxlor 46, 3, 3 + vadduwm 9, 29, 26 + vadduwm 8, 8, 14 + xxland 46, 47, 10 + xxland 45, 45, 10 + xxland 47, 49, 10 + xxland 34, 34, 10 + vadduwm 17, 14, 31 + vadduwm 16, 13, 16 + vadduwm 18, 15, 30 + vadduwm 3, 2, 3 + xxlxor 33, 49, 33 + xxlxor 43, 48, 43 + xxlxor 37, 50, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 44, 6, 6 + xxlor 0, 10, 10 + vadduwm 0, 0, 12 + xxlor 44, 2, 2 + vadduwm 9, 1, 9 + vadduwm 7, 7, 12 + vadduwm 8, 11, 8 + vadduwm 7, 4, 7 + vadduwm 0, 5, 0 + xxlxor 34, 39, 34 + xxlxor 44, 32, 47 + vperm 2, 2, 2, 6 + xxlxor 46, 41, 46 + xxlxor 45, 40, 45 + vperm 12, 12, 12, 6 + vperm 14, 14, 14, 6 + vperm 13, 13, 13, 6 + xxland 34, 34, 31 + xxlor 1, 31, 31 + vadduwm 3, 2, 3 + xxland 44, 44, 31 + xxlxor 36, 35, 36 + xxlxor 51, 35, 40 + xxland 35, 46, 31 + xxland 38, 45, 31 + vadduwm 15, 12, 18 + vadduwm 8, 3, 17 + vadduwm 13, 6, 16 + xxlxor 37, 47, 37 + xxlxor 33, 40, 33 + xxlxor 43, 45, 43 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlxor 47, 47, 41 + xxlxor 40, 40, 32 + xxlxor 39, 45, 39 + xxlxor 50, 36, 38 + xxlxor 63, 33, 44 + xxlxor 43, 43, 34 + xxlxor 41, 37, 35 + bne 0, .LBB3_2 +.LBB3_5: + vmrglw 2, 19, 15 + li 3, 32 + li 4, 48 + vmrglw 4, 7, 8 + vmrglw 0, 31, 18 + vmrglw 1, 9, 11 + vmrghw 3, 19, 15 + vmrghw 5, 7, 8 + vmrghw 6, 31, 18 + vmrghw 7, 9, 11 + xxmrgld 40, 36, 34 + xxmrghd 34, 36, 34 + xxmrgld 41, 33, 32 + xxswapd 0, 40 + xxmrgld 36, 37, 35 + xxmrghd 35, 37, 35 + xxmrghd 37, 33, 32 + xxswapd 1, 41 + xxmrgld 32, 39, 38 + xxmrghd 33, 39, 38 + xxswapd 2, 34 + xxswapd 4, 36 + xxswapd 3, 37 + stxvd2x 0, 0, 5 + xxswapd 5, 32 + stxvd2x 1, 5, 11 + xxswapd 0, 35 + xxswapd 1, 33 + stxvd2x 2, 5, 3 + li 3, 64 + stxvd2x 3, 5, 4 + li 4, 80 + stxvd2x 4, 5, 3 + li 3, 96 + stxvd2x 5, 5, 4 + li 4, 112 + stxvd2x 0, 5, 3 + stxvd2x 1, 5, 4 + li 3, 224 + lxvd2x 63, 1, 3 + li 3, 208 + lfd 31, 408(1) + ld 30, 304(1) + ld 29, 296(1) + lxvd2x 62, 1, 3 + li 3, 192 + lfd 30, 400(1) + ld 28, 288(1) + ld 27, 280(1) + lxvd2x 61, 1, 3 + li 3, 176 + lfd 29, 392(1) + ld 26, 272(1) + ld 25, 264(1) + lxvd2x 60, 1, 3 + li 3, 160 + lfd 28, 384(1) + ld 24, 256(1) + ld 23, 248(1) + lxvd2x 59, 1, 3 + li 3, 144 + lfd 27, 376(1) + ld 22, 240(1) + lxvd2x 58, 1, 3 + li 3, 128 + lfd 26, 368(1) + lxvd2x 57, 1, 3 + li 3, 112 + lfd 25, 360(1) + lxvd2x 56, 1, 3 + li 3, 96 + lfd 24, 352(1) + lxvd2x 55, 1, 3 + li 3, 80 + lfd 23, 344(1) + lxvd2x 54, 1, 3 + li 3, 64 + lfd 22, 336(1) + lxvd2x 53, 1, 3 + li 3, 48 + lfd 21, 328(1) + lxvd2x 52, 1, 3 + lfd 20, 320(1) + addi 1, 1, 416 + blr + .long 0 + .quad 0 +.Lfunc_end3: + .size blake3_hash4_sse41, .Lfunc_end3-.Lfunc_begin3 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S new file mode 100644 index 000000000..b15d8fc77 --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -0,0 +1,1845 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_AVX2) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_avx2 +.text + +.type zfs_blake3_hash_many_avx2,@function +.p2align 6 +zfs_blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr [rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.size zfs_blake3_hash_many_avx2, . - zfs_blake3_hash_many_avx2 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif + +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A +#endif /* HAVE_AVX2 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx512.S b/module/icp/asm-x86_64/blake3/blake3_avx512.S new file mode 100644 index 000000000..d02c5e7ec --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_avx512.S @@ -0,0 +1,2618 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_avx512 +.global zfs_blake3_compress_in_place_avx512 +.global zfs_blake3_compress_xof_avx512 +.text + +.type zfs_blake3_hash_many_avx512,@function +.type zfs_blake3_compress_xof_avx512,@function +.type zfs_blake3_compress_in_place_avx512,@function + +.p2align 6 +zfs_blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti32x8 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +zfs_blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +zfs_blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vpxor xmm2, xmm2, [rdi] + vpxor xmm3, xmm3, [rdi+0x10] + vmovdqu xmmword ptr [r9], xmm0 + vmovdqu xmmword ptr [r9+0x10], xmm1 + vmovdqu xmmword ptr [r9+0x20], xmm2 + vmovdqu xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512 +.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512 +.size zfs_blake3_compress_xof_avx512, . - zfs_blake3_compress_xof_avx512 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif + +.p2align 6 +INDEX0: + .long 0, 1, 2, 3, 16, 17, 18, 19 + .long 8, 9, 10, 11, 24, 25, 26, 27 +INDEX1: + .long 4, 5, 6, 7, 20, 21, 22, 23 + .long 12, 13, 14, 15, 28, 29, 30, 31 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 + .long 8, 9, 10, 11, 12, 13, 14, 15 +ADD1: .long 1 + +ADD16: .long 16 +BLAKE3_BLOCK_LEN: + .long 64 +.p2align 6 +BLAKE3_IV: +BLAKE3_IV_0: + .long 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A + +#endif /* HAVE_AVX512 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_sse2.S b/module/icp/asm-x86_64/blake3/blake3_sse2.S new file mode 100644 index 000000000..39d23ee23 --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_sse2.S @@ -0,0 +1,2323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_SSE2) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_sse2 +.global zfs_blake3_compress_in_place_sse2 +.global zfs_blake3_compress_xof_sse2 + +.text +.type zfs_blake3_hash_many_sse2,@function +.type zfs_blake3_compress_in_place_sse2,@function +.type zfs_blake3_compress_xof_sse2,@function + + .p2align 6 +zfs_blake3_hash_many_sse2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +zfs_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +zfs_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2 +.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2 +.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF + +#endif /* HAVE_SSE2 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_sse41.S b/module/icp/asm-x86_64/blake3/blake3_sse41.S new file mode 100644 index 000000000..1c40236f0 --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_sse41.S @@ -0,0 +1,2058 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt <[email protected]> + */ + +#if defined(HAVE_SSE4_1) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_compress_in_place_sse41 +.global zfs_blake3_compress_xof_sse41 +.global zfs_blake3_hash_many_sse41 + +.text +.type zfs_blake3_hash_many_sse41,@function +.type zfs_blake3_compress_in_place_sse41,@function +.type zfs_blake3_compress_xof_sse41,@function + +.p2align 6 +zfs_blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +zfs_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret +.p2align 6 +zfs_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41 +.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41 +.size zfs_blake3_compress_xof_sse41, . - zfs_blake3_compress_xof_sse41 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + +#endif /* HAVE_SSE4_1 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif |