From 985c33b132f6c23a69bd808e008ae0f46131a31e Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Thu, 9 Jun 2022 00:55:57 +0200 Subject: Introduce BLAKE3 checksums as an OpenZFS feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit adds BLAKE3 checksums to OpenZFS, it has similar performance to Edon-R, but without the caveats around the latter. Homepage of BLAKE3: https://github.com/BLAKE3-team/BLAKE3 Wikipedia: https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE3 Short description of Wikipedia: BLAKE3 is a cryptographic hash function based on Bao and BLAKE2, created by Jack O'Connor, Jean-Philippe Aumasson, Samuel Neves, and Zooko Wilcox-O'Hearn. It was announced on January 9, 2020, at Real World Crypto. BLAKE3 is a single algorithm with many desirable features (parallelism, XOF, KDF, PRF and MAC), in contrast to BLAKE and BLAKE2, which are algorithm families with multiple variants. BLAKE3 has a binary tree structure, so it supports a practically unlimited degree of parallelism (both SIMD and multithreading) given enough input. The official Rust and C implementations are dual-licensed as public domain (CC0) and the Apache License. Along with adding the BLAKE3 hash into the OpenZFS infrastructure a new benchmarking file called chksum_bench was introduced. When read it reports the speed of the available checksum functions. On Linux: cat /proc/spl/kstat/zfs/chksum_bench On FreeBSD: sysctl kstat.zfs.misc.chksum_bench This is an example output of an i3-1005G1 test system with Debian 11: implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 1196 1602 1761 1749 1762 1759 1751 skein-generic 546 591 608 615 619 612 616 sha256-generic 240 300 316 314 304 285 276 sha512-generic 353 441 467 476 472 467 426 blake3-generic 308 313 313 313 312 313 312 blake3-sse2 402 1289 1423 1446 1432 1458 1413 blake3-sse41 427 1470 1625 1704 1679 1607 1629 blake3-avx2 428 1920 3095 3343 3356 3318 3204 blake3-avx512 473 2687 4905 5836 5844 5643 5374 Output on Debian 5.10.0-10-amd64 system: (Ryzen 7 5800X) implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 1840 2458 2665 2719 2711 2723 2693 skein-generic 870 966 996 992 1003 1005 1009 sha256-generic 415 442 453 455 457 457 457 sha512-generic 608 690 711 718 719 720 721 blake3-generic 301 313 311 309 309 310 310 blake3-sse2 343 1865 2124 2188 2180 2181 2186 blake3-sse41 364 2091 2396 2509 2463 2482 2488 blake3-avx2 365 2590 4399 4971 4915 4802 4764 Output on Debian 5.10.0-9-powerpc64le system: (POWER 9) implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 1213 1703 1889 1918 1957 1902 1907 skein-generic 434 492 520 522 511 525 525 sha256-generic 167 183 187 188 188 187 188 sha512-generic 186 216 222 221 225 224 224 blake3-generic 153 152 154 153 151 153 153 blake3-sse2 391 1170 1366 1406 1428 1426 1414 blake3-sse41 352 1049 1212 1174 1262 1258 1259 Output on Debian 5.10.0-11-arm64 system: (Pi400) implementation 1k 4k 16k 64k 256k 1m 4m edonr-generic 487 603 629 639 643 641 641 skein-generic 271 299 303 308 309 309 307 sha256-generic 117 127 128 130 130 129 130 sha512-generic 145 165 170 172 173 174 175 blake3-generic 81 29 71 89 89 89 89 blake3-sse2 112 323 368 379 380 371 374 blake3-sse41 101 315 357 368 369 364 360 Structurally, the new code is mainly split into these parts: - 1x cross platform generic c variant: blake3_generic.c - 4x assembly for X86-64 (SSE2, SSE4.1, AVX2, AVX512) - 2x assembly for ARMv8 (NEON converted from SSE2) - 2x assembly for PPC64-LE (POWER8 converted from SSE2) - one file for switching between the implementations Note the PPC64 assembly requires the VSX instruction set and the kfpu_begin() / kfpu_end() calls on PowerPC were updated accordingly. Reviewed-by: Felix Dörre Reviewed-by: Ahelenia Ziemiańska Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Co-authored-by: Rich Ercolani Closes #10058 Closes #12918 --- module/icp/algs/blake3/blake3.c | 732 ++++++++++++++++++++++++++++++++ module/icp/algs/blake3/blake3_generic.c | 202 +++++++++ module/icp/algs/blake3/blake3_impl.c | 256 +++++++++++ module/icp/algs/blake3/blake3_impl.h | 213 ++++++++++ module/icp/algs/blake3/blake3_x86-64.c | 248 +++++++++++ 5 files changed, 1651 insertions(+) create mode 100644 module/icp/algs/blake3/blake3.c create mode 100644 module/icp/algs/blake3/blake3_generic.c create mode 100644 module/icp/algs/blake3/blake3_impl.c create mode 100644 module/icp/algs/blake3/blake3_impl.h create mode 100644 module/icp/algs/blake3/blake3_x86-64.c (limited to 'module/icp/algs') diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c new file mode 100644 index 000000000..8c9c06eb9 --- /dev/null +++ b/module/icp/algs/blake3/blake3.c @@ -0,0 +1,732 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include +#include + +#include "blake3_impl.h" + +/* + * We need 1056 byte stack for blake3_compress_subtree_wide() + * - we define this pragma to make gcc happy + */ +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +/* internal used */ +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +/* internal flags */ +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +/* internal start */ +static void chunk_state_init(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint8_t flags) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; + ctx->blocks_compressed = 0; + ctx->flags = flags; +} + +static void chunk_state_reset(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint64_t chunk_counter) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = chunk_counter; + ctx->blocks_compressed = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; +} + +static size_t chunk_state_len(const blake3_chunk_state_t *ctx) +{ + return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) + + ((size_t)ctx->buf_len); +} + +static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx, + const uint8_t *input, size_t input_len) +{ + size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len); + memcpy(dest, input, take); + ctx->buf_len += (uint8_t)take; + return (take); +} + +static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx) +{ + if (ctx->blocks_compressed == 0) { + return (CHUNK_START); + } else { + return (0); + } +} + +static output_t make_output(const uint32_t input_cv[8], + const uint8_t *block, uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return (ret); +} + +/* + * Chaining values within a given chunk (specifically the compress_in_place + * interface) are represented as words. This avoids unnecessary bytes<->words + * conversion overhead in the portable implementation. However, the hash_many + * interface handles both user input and parent node blocks, so it accepts + * bytes. For that reason, chaining values in the CV stack are represented as + * bytes. + */ +static void output_chaining_value(const blake3_impl_ops_t *ops, + const output_t *ctx, uint8_t cv[32]) +{ + uint32_t cv_words[8]; + memcpy(cv_words, ctx->input_cv, 32); + ops->compress_in_place(cv_words, ctx->block, ctx->block_len, + ctx->counter, ctx->flags); + store_cv_words(cv, cv_words); +} + +static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, + uint64_t seek, uint8_t *out, size_t out_len) +{ + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len, + output_block_counter, ctx->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +static void chunk_state_update(const blake3_impl_ops_t *ops, + blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) +{ + if (ctx->buf_len > 0) { + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + ops->compress_in_place(ctx->cv, ctx->buf, + BLAKE3_BLOCK_LEN, ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + ctx->buf_len = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN, + ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; +} + +static output_t chunk_state_output(const blake3_chunk_state_t *ctx) +{ + uint8_t block_flags = + ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END; + return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter, + block_flags)); +} + +static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) +{ + return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT)); +} + +/* + * Given some input larger than one chunk, return the number of bytes that + * should go in the left subtree. This is the largest power-of-2 number of + * chunks that leaves at least 1 byte for the right subtree. + */ +static size_t left_len(size_t content_len) +{ + /* + * Subtract 1 to reserve at least one byte for the right side. + * content_len + * should always be greater than BLAKE3_CHUNK_LEN. + */ + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return (round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN); +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time + * on a single thread. Write out the chunk chaining values and return the + * number of chunks hashed. These chunks are never the root and never empty; + * those cases use a different codepath. + */ +static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / + BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE, flags, CHUNK_START, + CHUNK_END, out); + + /* + * Hash the remaining partial chunk, if there is one. Note that the + * empty chunk (meaning the empty message) is a different codepath. + */ + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(ops, &chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(ops, &output, &out[chunks_array_len * + BLAKE3_OUT_LEN]); + return (chunks_array_len + 1); + } else { + return (chunks_array_len); + } +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time + * on a single thread. Write out the parent chaining values and return the + * number of parents hashed. (If there's an odd input chaining value left over, + * return it as an additional output.) These parents are never the root and + * never empty; those cases use a different codepath. + */ +static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, + const uint8_t *child_chaining_values, size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, uint8_t *out) +{ + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = &child_chaining_values[2 * + parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE, + flags | PARENT, 0, 0, out); + + /* If there's an odd child left over, it becomes an output. */ + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * + BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); + return (parents_array_len + 1); + } else { + return (parents_array_len); + } +} + +/* + * The wide helper function returns (writes out) an array of chaining values + * and returns the length of that array. The number of chaining values returned + * is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, + * if the input is shorter than that many chunks. The reason for maintaining a + * wide array of chaining values going back up the tree, is to allow the + * implementation to hash as many parents in parallel as possible. + * + * As a special case when the SIMD degree is 1, this function will still return + * at least 2 outputs. This guarantees that this function doesn't perform the + * root compression. (If it did, it would use the wrong flags, and also we + * wouldn't be able to implement exendable ouput.) Note that this function is + * not used when the whole input is only 1 chunk long; that's a different + * codepath. + * + * Why not just have the caller split the input on the first update(), instead + * of implementing this special rule? Because we don't want to limit SIMD or + * multi-threading parallelism for that update(). + */ +static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + /* + * Note that the single chunk case does *not* bump the SIMD degree up + * to 2 when it is 1. If this implementation adds multi-threading in + * the future, this gives us the option of multi-threading even the + * 2-chunk case, which can help performance on smaller platforms. + */ + if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) { + return (compress_chunks_parallel(ops, input, input_len, key, + chunk_counter, flags, out)); + } + + + /* + * With more than simd_degree chunks, we need to recurse. Start by + * dividing the input into left and right subtrees. (Note that this is + * only optimal as long as the SIMD degree is a power of 2. If we ever + * get a SIMD degree of 3 or something, we'll need a more complicated + * strategy.) + */ + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = chunk_counter + + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + /* + * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 + * to account for the special case of returning 2 outputs when the + * SIMD degree is 1. + */ + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = ops->degree; + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + + /* + * The special case: We always use a degree of at least two, + * to make sure there are two outputs. Except, as noted above, + * at the chunk level, where we allow degree=1. (Note that the + * 1-chunk-input case is a different codepath.) + */ + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + /* + * Recurse! If this implementation adds multi-threading support in the + * future, this is where it will go. + */ + size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len, + key, chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide(ops, right_input, + right_input_len, key, right_chunk_counter, flags, right_cvs); + + /* + * The special case again. If simd_degree=1, then we'll have left_n=1 + * and right_n=1. Rather than compressing them into a single output, + * return them directly, to make sure we always have at least two + * outputs. + */ + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return (2); + } + + /* Otherwise, do one layer of parent node compression. */ + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(ops, cv_array, + num_chaining_values, key, flags, out); +} + +/* + * Hash a subtree with compress_subtree_wide(), and then condense the resulting + * list of chaining values down to a single parent node. Don't compress that + * last parent node, however. Instead, return its message bytes (the + * concatenated chaining values of its children). This is necessary when the + * first call to update() supplies a complete subtree, because the topmost + * parent node of that subtree could end up being the root. It's also necessary + * for extended output in the general case. + * + * As with compress_subtree_wide(), this function is not used on inputs of 1 + * chunk or less. That's a different codepath. + */ +static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) +{ + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(ops, input, input_len, + key, chunk_counter, flags, cv_array); + + /* + * If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + * compress_subtree_wide() returns more than 2 chaining values. Condense + * them into 2 by forming parent nodes repeatedly. + */ + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = compress_parents_parallel(ops, cv_array, num_cvs, key, + flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], + uint8_t flags) +{ + memcpy(ctx->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&ctx->chunk, key, flags); + ctx->cv_stack_len = 0; + ctx->ops = blake3_impl_get_ops(); +} + +/* + * As described in hasher_push_cv() below, we do "lazy merging", delaying + * merges until right before the next CV is about to be added. This is + * different from the reference implementation. Another difference is that we + * aren't always merging 1 chunk at a time. Instead, each CV might represent + * any power-of-two number of chunks, as long as the smaller-above-larger + * stack order is maintained. Instead of the "count the trailing 0-bits" + * algorithm described in the spec, we use a "count the total number of + * 1-bits" variant that doesn't require us to retain the subtree size of the + * CV on top of the stack. The principle is the same: each CV that should + * remain in the stack is represented by a 1-bit in the total number of chunks + * (or bytes) so far. + */ +static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len) +{ + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (ctx->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = + parent_output(parent_node, ctx->key, ctx->chunk.flags); + output_chaining_value(ctx->ops, &output, parent_node); + ctx->cv_stack_len -= 1; + } +} + +/* + * In reference_impl.rs, we merge the new CV with existing CVs from the stack + * before pushing it. We can do that because we know more input is coming, so + * we know none of the merges are root. + * + * This setting is different. We want to feed as much input as possible to + * compress_subtree_wide(), without setting aside anything for the chunk_state. + * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once + * as a single subtree, if at all possible. + * + * This leads to two problems: + * 1) This 64 KiB input might be the only call that ever gets made to update. + * In this case, the root node of the 64 KiB subtree would be the root node + * of the whole tree, and it would need to be ROOT finalized. We can't + * compress it until we know. + * 2) This 64 KiB input might complete a larger tree, whose root node is + * similarly going to be the the root of the whole tree. For example, maybe + * we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the + * node at the root of the 256 KiB subtree until we know how to finalize it. + * + * The second problem is solved with "lazy merging". That is, when we're about + * to add a CV to the stack, we don't merge it with anything first, as the + * reference impl does. Instead we do merges using the *previous* CV that was + * added, which is sitting on top of the stack, and we put the new CV + * (unmerged) on top of the stack afterwards. This guarantees that we never + * merge the root node until finalize(). + * + * Solving the first problem requires an additional tool, + * compress_subtree_to_parent_node(). That function always returns the top + * *two* chaining values of the subtree it's compressing. We then do lazy + * merging with each of them separately, so that the second CV will always + * remain unmerged. (That also helps us support extendable output when we're + * hashing an input all-at-once.) + */ +static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) +{ + hasher_merge_cv_stack(ctx, chunk_counter); + memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + ctx->cv_stack_len += 1; +} + +void +Blake3_Init(BLAKE3_CTX *ctx) +{ + hasher_init_base(ctx, BLAKE3_IV, 0); +} + +void +Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]) +{ + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(ctx, key_words, KEYED_HASH); +} + +static void +Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector v; + * blake3_hasher_update(&hasher, v.data(), v.size()); + */ + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + /* + * If we have some partial chunk bytes in the internal chunk_state, we + * need to finish that chunk first. + */ + if (chunk_state_len(&ctx->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + /* + * If we've filled the current chunk and there's more coming, + * finalize this chunk and proceed. In this case we know it's + * not the root. + */ + if (input_len > 0) { + output_t output = chunk_state_output(&ctx->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(ctx->ops, &output, chunk_cv); + hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter); + chunk_state_reset(&ctx->chunk, ctx->key, + ctx->chunk.chunk_counter + 1); + } else { + return; + } + } + + /* + * Now the chunk_state is clear, and we have more input. If there's + * more than a single chunk (so, definitely not the root chunk), hash + * the largest whole subtree we can, with the full benefits of SIMD + * (and maybe in the future, multi-threading) parallelism. Two + * restrictions: + * - The subtree has to be a power-of-2 number of chunks. Only + * subtrees along the right edge can be incomplete, and we don't know + * where the right edge is going to be until we get to finalize(). + * - The subtree must evenly divide the total number of chunks up + * until this point (if total is not 0). If the current incomplete + * subtree is only waiting for 1 more chunk, we can't hash a subtree + * of 4 chunks. We have to complete the current subtree first. + * Because we might need to break up the input to form powers of 2, or + * to evenly divide what we already have, this part runs in a loop. + */ + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = + ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + /* + * Shrink the subtree_len until it evenly divides the count so + * far. We know that subtree_len itself is a power of 2, so we + * can use a bitmasking trick instead of an actual remainder + * operation. (Note that if the caller consistently passes + * power-of-2 inputs of the same size, as is hopefully + * typical, this loop condition will always fail, and + * subtree_len will always be the full length of the input.) + * + * An aside: We don't have to shrink subtree_len quite this + * much. For example, if count_so_far is 1, we could pass 2 + * chunks to compress_subtree_to_parent_node. Since we'll get + * 2 CVs back, we'll still get the right answer in the end, + * and we might get to use 2-way SIMD parallelism. The problem + * with this optimization, is that it gets us stuck always + * hashing 2 chunks. The total number of chunks will remain + * odd, and we'll never graduate to higher degrees of + * parallelism. See + * https://github.com/BLAKE3-team/BLAKE3/issues/69. + */ + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + /* + * The shrunken subtree_len might now be 1 chunk long. If so, + * hash that one chunk by itself. Otherwise, compress the + * subtree into a pair of CVs. + */ + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, ctx->key, + ctx->chunk.flags); + chunk_state.chunk_counter = ctx->chunk.chunk_counter; + chunk_state_update(ctx->ops, &chunk_state, input_bytes, + subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(ctx->ops, &output, cv); + hasher_push_cv(ctx, cv, chunk_state.chunk_counter); + } else { + /* + * This is the high-performance happy path, though + * getting here depends on the caller giving us a long + * enough input. + */ + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(ctx->ops, input_bytes, + subtree_len, ctx->key, ctx-> chunk.chunk_counter, + ctx->chunk.flags, cv_pair); + hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter); + hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN], + ctx->chunk.chunk_counter + (subtree_chunks / 2)); + } + ctx->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + /* + * If there's any remaining input less than a full chunk, add it to + * the chunk state. In that case, also do a final merge loop to make + * sure the subtree stack doesn't contain any unmerged pairs. The + * remaining input means we know these merges are non-root. This merge + * loop isn't strictly necessary here, because hasher_push_chunk_cv + * already does its own merge loop, but it simplifies + * blake3_hasher_finalize below. + */ + if (input_len > 0) { + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, + input_len); + hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter); + } +} + +void +Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo) +{ + size_t done = 0; + const uint8_t *data = input; + const size_t block_max = 1024 * 64; + + /* max feed buffer to leave the stack size small */ + while (todo != 0) { + size_t block = (todo >= block_max) ? block_max : todo; + Blake3_Update2(ctx, data + done, block); + done += block; + todo -= block; + } +} + +void +Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out) +{ + Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN); +} + +void +Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out, + size_t out_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector v; + * blake3_hasher_finalize(&hasher, v.data(), v.size()); + */ + if (out_len == 0) { + return; + } + /* If the subtree stack is empty, then the current chunk is the root. */ + if (ctx->cv_stack_len == 0) { + output_t output = chunk_state_output(&ctx->chunk); + output_root_bytes(ctx->ops, &output, seek, out, out_len); + return; + } + /* + * If there are any bytes in the chunk state, finalize that chunk and + * do a roll-up merge between that chunk hash and every subtree in the + * stack. In this case, the extra merge loop at the end of + * blake3_hasher_update guarantees that none of the subtrees in the + * stack need to be merged with each other first. Otherwise, if there + * are no bytes in the chunk state, then the top of the stack is a + * chunk hash, and we start the merge from that. + */ + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&ctx->chunk) > 0) { + cvs_remaining = ctx->cv_stack_len; + output = chunk_state_output(&ctx->chunk); + } else { + /* There are always at least 2 CVs in the stack in this case. */ + cvs_remaining = ctx->cv_stack_len - 2; + output = parent_output(&ctx->cv_stack[cvs_remaining * 32], + ctx->key, ctx->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(ctx->ops, &output, &parent_block[32]); + output = parent_output(parent_block, ctx->key, + ctx->chunk.flags); + } + output_root_bytes(ctx->ops, &output, seek, out, out_len); +} diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c new file mode 100644 index 000000000..6ff9a845c --- /dev/null +++ b/module/icp/algs/blake3/blake3_generic.c @@ -0,0 +1,202 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include +#include "blake3_impl.h" + +#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +static inline void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) +{ + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +static inline void round_fn(uint32_t state[16], const uint32_t *msg, + size_t round) +{ + /* Select the message schedule based on the round. */ + const uint8_t *schedule = BLAKE3_MSG_SCHEDULE[round]; + + /* Mix the columns. */ + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + /* Mix the rows. */ + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +static inline void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = BLAKE3_IV[0]; + state[9] = BLAKE3_IV[1]; + state[10] = BLAKE3_IV[2]; + state[11] = BLAKE3_IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +static inline void blake3_compress_in_place_generic(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +static inline void hash_one_generic(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) +{ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_generic(cv, input, BLAKE3_BLOCK_LEN, + counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +static inline void blake3_compress_xof_generic(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +static inline void blake3_hash_many_generic(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, + boolean_t increment_counter, uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) +{ + while (num_inputs > 0) { + hash_one_generic(inputs[0], blocks, key, counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} + +static inline boolean_t blake3_is_generic_supported(void) +{ + return (B_TRUE); +} + +const blake3_impl_ops_t blake3_generic_impl = { + .compress_in_place = blake3_compress_in_place_generic, + .compress_xof = blake3_compress_xof_generic, + .hash_many = blake3_hash_many_generic, + .is_supported = blake3_is_generic_supported, + .degree = 4, + .name = "generic" +}; diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c new file mode 100644 index 000000000..c3268ec13 --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.c @@ -0,0 +1,256 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include +#include + +#include "blake3_impl.h" + +static const blake3_impl_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +}; + +/* this pointer holds current ops for implementation */ +static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl; + +/* special implementation selections */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX-1) +#define IMPL_USER (UINT32_MAX-2) +#define IMPL_PARAM (UINT32_MAX-3) + +#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) +static uint32_t icp_blake3_impl = IMPL_FASTEST; + +#define BLAKE3_IMPL_NAME_MAX 16 + +/* id of fastest implementation */ +static uint32_t blake3_fastest_id = 0; + +/* currently used id */ +static uint32_t blake3_current_id = 0; + +/* id of module parameter (-1 == unused) */ +static int blake3_param_id = -1; + +/* return number of supported implementations */ +int +blake3_get_impl_count(void) +{ + static int impls = 0; + int i; + + if (impls) + return (impls); + + for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + impls++; + } + + return (impls); +} + +/* return id of selected implementation */ +int +blake3_get_impl_id(void) +{ + return (blake3_current_id); +} + +/* return name of selected implementation */ +const char * +blake3_get_impl_name(void) +{ + return (blake3_selected_impl->name); +} + +/* setup id as fastest implementation */ +void +blake3_set_impl_fastest(uint32_t id) +{ + blake3_fastest_id = id; +} + +/* set implementation by id */ +void +blake3_set_impl_id(uint32_t id) +{ + int i, cid; + + /* select fastest */ + if (id == IMPL_FASTEST) + id = blake3_fastest_id; + + /* select next or first */ + if (id == IMPL_CYCLE) + id = (++blake3_current_id) % blake3_get_impl_count(); + + /* 0..N for the real impl */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (cid == id) { + blake3_current_id = cid; + blake3_selected_impl = blake3_impls[i]; + return; + } + cid++; + } +} + +/* set implementation by name */ +int +blake3_set_impl_name(const char *name) +{ + int i, cid; + + if (strcmp(name, "fastest") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST); + blake3_set_impl_id(IMPL_FASTEST); + return (0); + } else if (strcmp(name, "cycle") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE); + blake3_set_impl_id(IMPL_CYCLE); + return (0); + } + + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (strcmp(name, blake3_impls[i]->name) == 0) { + if (icp_blake3_impl == IMPL_PARAM) { + blake3_param_id = cid; + return (0); + } + blake3_selected_impl = blake3_impls[i]; + blake3_current_id = cid; + return (0); + } + cid++; + } + + return (-EINVAL); +} + +/* setup implementation */ +void +blake3_setup_impl(void) +{ + switch (IMPL_READ(icp_blake3_impl)) { + case IMPL_PARAM: + blake3_set_impl_id(blake3_param_id); + atomic_swap_32(&icp_blake3_impl, IMPL_USER); + break; + case IMPL_FASTEST: + blake3_set_impl_id(IMPL_FASTEST); + break; + case IMPL_CYCLE: + blake3_set_impl_id(IMPL_CYCLE); + break; + default: + blake3_set_impl_id(blake3_current_id); + break; + } +} + +/* return selected implementation */ +const blake3_impl_ops_t * +blake3_impl_get_ops(void) +{ + /* each call to ops will cycle */ + if (icp_blake3_impl == IMPL_CYCLE) + blake3_set_impl_id(IMPL_CYCLE); + + return (blake3_selected_impl); +} + +#if defined(_KERNEL) && defined(__linux__) +static int +icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp) +{ + char req_name[BLAKE3_IMPL_NAME_MAX]; + size_t i; + + /* sanitize input */ + i = strnlen(name, BLAKE3_IMPL_NAME_MAX); + if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX) + return (-EINVAL); + + strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX); + while (i > 0 && isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + atomic_swap_32(&icp_blake3_impl, IMPL_PARAM); + return (blake3_set_impl_name(req_name)); +} + +static int +icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp) +{ + int i, cid, cnt = 0; + char *fmt; + + /* cycling */ + fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle "; + cnt += sprintf(buffer + cnt, fmt); + + /* fastest one */ + fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest "; + cnt += sprintf(buffer + cnt, fmt); + + /* user selected */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + fmt = (icp_blake3_impl == IMPL_USER && + cid == blake3_current_id) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name); + cid++; + } + + buffer[cnt] = 0; + + return (cnt); +} + +module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get, + NULL, 0644); +MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation."); +#endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h new file mode 100644 index 000000000..7b40cc4d3 --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* + * Methods used to define BLAKE3 assembler implementations + */ +typedef void (*blake3_compress_in_place_f)(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +typedef void (*blake3_compress_xof_f)(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +typedef boolean_t (*blake3_is_supported_f)(void); + +typedef struct blake3_impl_ops { + blake3_compress_in_place_f compress_in_place; + blake3_compress_xof_f compress_xof; + blake3_hash_many_f hash_many; + blake3_is_supported_f is_supported; + int degree; + const char *name; +} blake3_impl_ops_t; + +/* Return selected BLAKE3 implementation ops */ +extern const blake3_impl_ops_t *blake3_impl_get_ops(void); + +extern const blake3_impl_ops_t blake3_generic_impl; + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse2_impl; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse41_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern const blake3_impl_ops_t blake3_avx2_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern const blake3_impl_ops_t blake3_avx512_impl; +#endif + +#if defined(__x86_64) +#define MAX_SIMD_DEGREE 16 +#else +#define MAX_SIMD_DEGREE 4 +#endif + +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2) + +static const uint32_t BLAKE3_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +static inline unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (63 ^ __builtin_clzll(x)); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return (index); +#elif defined(_MSC_VER) && defined(IS_X86_32) + if (x >> 32) { + unsigned long index; + _BitScanReverse(&index, x >> 32); + return (32 + index); + } else { + unsigned long index; + _BitScanReverse(&index, x); + return (index); + } +#else + unsigned int c = 0; + if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if (x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if (x & 0x0000000000000002ULL) { c += 1; } + return (c); +#endif +} + +/* Count the number of 1 bits. */ +static inline unsigned int popcnt(uint64_t x) { + unsigned int count = 0; + + while (x != 0) { + count += 1; + x &= x - 1; + } + + return (count); +} + +/* + * Largest power of two less than or equal to x. + * As a special case, returns 1 when x is 0. + */ +static inline uint64_t round_down_to_power_of_2(uint64_t x) { + return (1ULL << highest_one(x | 1)); +} + +static inline uint32_t counter_low(uint64_t counter) { + return ((uint32_t)counter); +} + +static inline uint32_t counter_high(uint64_t counter) { + return ((uint32_t)(counter >> 32)); +} + +static inline uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +static inline void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_IMPL_H */ diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c new file mode 100644 index 000000000..48715e212 --- /dev/null +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include "blake3_impl.h" + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse2_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse41_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx2_supported(void) +{ + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); +} + +const blake3_impl_ops_t blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx512_supported(void) +{ + return (kfpu_allowed() && zfs_avx512f_available() && + zfs_avx512vl_available()); +} + +const blake3_impl_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif -- cgit v1.2.3