diff options
author | Brian Behlendorf <[email protected]> | 2016-10-04 11:20:38 -0700 |
---|---|---|
committer | GitHub <[email protected]> | 2016-10-04 11:20:38 -0700 |
commit | 5cc78dc81232bc474d25ccfcacb42d80d83c5310 (patch) | |
tree | 7dfaccfc85d5e687a7334450f3dbf16865bf3995 /module/icp | |
parent | 0c313d2f7451041e9cc952fe68fb500efef52fe1 (diff) | |
parent | 7d75815dc950bdce3fd03cc40a3352d93c270e0f (diff) |
Merge OpenZFS 4185
OpenZFS 4185 - add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R
Reviewed-by: Chunwei Chen <[email protected]>
Reviewed-by: Tom Caputi <[email protected]>
Reviewed-by: David Quigley <[email protected]>
Reviewed-by: Brian Behlendorf <[email protected]>
Signed-off-by: Tony Hutter <[email protected]>
Closes #4760
Diffstat (limited to 'module/icp')
-rw-r--r-- | module/icp/Makefile.in | 10 | ||||
-rw-r--r-- | module/icp/algs/edonr/edonr.c | 751 | ||||
-rw-r--r-- | module/icp/algs/edonr/edonr_byteorder.h | 216 | ||||
-rw-r--r-- | module/icp/algs/sha2/sha2.c | 477 | ||||
-rw-r--r-- | module/icp/algs/skein/THIRDPARTYLICENSE | 3 | ||||
-rw-r--r-- | module/icp/algs/skein/THIRDPARTYLICENSE.descrip | 1 | ||||
-rw-r--r-- | module/icp/algs/skein/skein.c | 921 | ||||
-rw-r--r-- | module/icp/algs/skein/skein_block.c | 793 | ||||
-rw-r--r-- | module/icp/algs/skein/skein_impl.h | 289 | ||||
-rw-r--r-- | module/icp/algs/skein/skein_iv.c | 185 | ||||
-rw-r--r-- | module/icp/algs/skein/skein_port.h | 128 | ||||
-rw-r--r-- | module/icp/asm-x86_64/sha2/sha256_impl.S | 8 | ||||
-rw-r--r-- | module/icp/asm-x86_64/sha2/sha512_impl.S | 2083 | ||||
-rw-r--r-- | module/icp/illumos-crypto.c | 4 | ||||
-rw-r--r-- | module/icp/include/sha2/sha2.h | 116 | ||||
-rw-r--r-- | module/icp/include/sha2/sha2_impl.h | 2 | ||||
-rw-r--r-- | module/icp/io/edonr_mod.c | 62 | ||||
-rw-r--r-- | module/icp/io/sha2_mod.c | 2 | ||||
-rw-r--r-- | module/icp/io/skein_mod.c | 721 |
19 files changed, 6644 insertions, 128 deletions
diff --git a/module/icp/Makefile.in b/module/icp/Makefile.in index 4be03dbae..b822635b7 100644 --- a/module/icp/Makefile.in +++ b/module/icp/Makefile.in @@ -12,6 +12,7 @@ ASM_SOURCES += asm-x86_64/aes/aes_intel.o ASM_SOURCES += asm-x86_64/modes/gcm_intel.o ASM_SOURCES += asm-x86_64/sha1/sha1-x86_64.o ASM_SOURCES += asm-x86_64/sha2/sha256_impl.o +ASM_SOURCES += asm-x86_64/sha2/sha512_impl.o endif ifeq ($(TARGET_ASM_DIR), asm-i386) @@ -43,8 +44,10 @@ $(MODULE)-objs += core/kcf_mech_tabs.o $(MODULE)-objs += core/kcf_prov_lib.o $(MODULE)-objs += spi/kcf_spi.o $(MODULE)-objs += io/aes.o +$(MODULE)-objs += io/edonr_mod.o $(MODULE)-objs += io/sha1_mod.o $(MODULE)-objs += io/sha2_mod.o +$(MODULE)-objs += io/skein_mod.o $(MODULE)-objs += os/modhash.o $(MODULE)-objs += os/modconf.o $(MODULE)-objs += algs/modes/cbc.o @@ -55,8 +58,13 @@ $(MODULE)-objs += algs/modes/gcm.o $(MODULE)-objs += algs/modes/modes.o $(MODULE)-objs += algs/aes/aes_impl.o $(MODULE)-objs += algs/aes/aes_modes.o +$(MODULE)-objs += algs/edonr/edonr.o $(MODULE)-objs += algs/sha1/sha1.o $(MODULE)-objs += algs/sha2/sha2.o +$(MODULE)-objs += algs/sha1/sha1.o +$(MODULE)-objs += algs/skein/skein.o +$(MODULE)-objs += algs/skein/skein_block.o +$(MODULE)-objs += algs/skein/skein_iv.o $(MODULE)-objs += $(ASM_SOURCES) ICP_DIRS = \ @@ -67,9 +75,11 @@ ICP_DIRS = \ os \ algs \ algs/aes \ + algs/edonr \ algs/modes \ algs/sha1 \ algs/sha2 \ + algs/skein \ asm-x86_64 \ asm-x86_64/aes \ asm-x86_64/modes \ diff --git a/module/icp/algs/edonr/edonr.c b/module/icp/algs/edonr/edonr.c new file mode 100644 index 000000000..8ae989890 --- /dev/null +++ b/module/icp/algs/edonr/edonr.c @@ -0,0 +1,751 @@ +/* + * IDI,NTNU + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright (C) 2009, 2010, Jorn Amundsen <[email protected]> + * Tweaked Edon-R implementation for SUPERCOP, based on NIST API. + * + * $Id: edonr.c 517 2013-02-17 20:34:39Z joern $ + */ +/* + * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved + */ + +/* determine where we can get bcopy/bzero declarations */ +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <strings.h> +#endif +#include <sys/edonr.h> +#include <sys/debug.h> + +/* big endian support, provides no-op's if run on little endian hosts */ +#include "edonr_byteorder.h" + +#define hashState224(x) ((x)->pipe->p256) +#define hashState256(x) ((x)->pipe->p256) +#define hashState384(x) ((x)->pipe->p512) +#define hashState512(x) ((x)->pipe->p512) + +/* shift and rotate shortcuts */ +#define shl(x, n) ((x) << n) +#define shr(x, n) ((x) >> n) + +#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + +#define rotl64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#define rotr64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#if !defined(__C99_RESTRICT) +#define restrict /* restrict */ +#endif + +#define EDONR_VALID_HASHBITLEN(x) \ + ((x) == 512 || (x) == 384 || (x) == 256 || (x) == 224) + +/* EdonR224 initial double chaining pipe */ +static const uint32_t i224p2[16] = { + 0x00010203ul, 0x04050607ul, 0x08090a0bul, 0x0c0d0e0ful, + 
0x10111213ul, 0x14151617ul, 0x18191a1bul, 0x1c1d1e1ful, + 0x20212223ul, 0x24252627ul, 0x28292a2bul, 0x2c2d2e2ful, + 0x30313233ul, 0x34353637ul, 0x38393a3bul, 0x3c3d3e3ful, +}; + +/* EdonR256 initial double chaining pipe */ +static const uint32_t i256p2[16] = { + 0x40414243ul, 0x44454647ul, 0x48494a4bul, 0x4c4d4e4ful, + 0x50515253ul, 0x54555657ul, 0x58595a5bul, 0x5c5d5e5ful, + 0x60616263ul, 0x64656667ul, 0x68696a6bul, 0x6c6d6e6ful, + 0x70717273ul, 0x74757677ul, 0x78797a7bul, 0x7c7d7e7ful, +}; + +/* EdonR384 initial double chaining pipe */ +static const uint64_t i384p2[16] = { + 0x0001020304050607ull, 0x08090a0b0c0d0e0full, + 0x1011121314151617ull, 0x18191a1b1c1d1e1full, + 0x2021222324252627ull, 0x28292a2b2c2d2e2full, + 0x3031323334353637ull, 0x38393a3b3c3d3e3full, + 0x4041424344454647ull, 0x48494a4b4c4d4e4full, + 0x5051525354555657ull, 0x58595a5b5c5d5e5full, + 0x6061626364656667ull, 0x68696a6b6c6d6e6full, + 0x7071727374757677ull, 0x78797a7b7c7d7e7full +}; + +/* EdonR512 initial double chaining pipe */ +static const uint64_t i512p2[16] = { + 0x8081828384858687ull, 0x88898a8b8c8d8e8full, + 0x9091929394959697ull, 0x98999a9b9c9d9e9full, + 0xa0a1a2a3a4a5a6a7ull, 0xa8a9aaabacadaeafull, + 0xb0b1b2b3b4b5b6b7ull, 0xb8b9babbbcbdbebfull, + 0xc0c1c2c3c4c5c6c7ull, 0xc8c9cacbcccdcecfull, + 0xd0d1d2d3d4d5d6d7ull, 0xd8d9dadbdcdddedfull, + 0xe0e1e2e3e4e5e6e7ull, 0xe8e9eaebecedeeefull, + 0xf0f1f2f3f4f5f6f7ull, 0xf8f9fafbfcfdfeffull +}; + +/* + * First Latin Square + * 0 7 1 3 2 4 6 5 + * 4 1 7 6 3 0 5 2 + * 7 0 4 2 5 3 1 6 + * 1 4 0 5 6 2 7 3 + * 2 3 6 7 1 5 0 4 + * 5 2 3 1 7 6 4 0 + * 3 6 5 0 4 7 2 1 + * 6 5 2 4 0 1 3 7 + */ +#define LS1_256(c, x0, x1, x2, x3, x4, x5, x6, x7) \ +{ \ + uint32_t x04, x17, x23, x56, x07, x26; \ + x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \ + s0 = c + x07 + x2; \ + s1 = rotl32(x07 + x3, 4); \ + s2 = rotl32(x07 + x6, 8); \ + x23 = x2 + x3; \ + s5 = rotl32(x04 + x23 + x5, 22); \ + x56 = x5 + x6; \ + s6 = rotl32(x17 + x56 + x0, 24); \ + x26 = x23+x56; \ + 
s3 = rotl32(x26 + x7, 13); \ + s4 = rotl32(x26 + x1, 17); \ + s7 = rotl32(x26 + x4, 29); \ +} + +#define LS1_512(c, x0, x1, x2, x3, x4, x5, x6, x7) \ +{ \ + uint64_t x04, x17, x23, x56, x07, x26; \ + x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \ + s0 = c + x07 + x2; \ + s1 = rotl64(x07 + x3, 5); \ + s2 = rotl64(x07 + x6, 15); \ + x23 = x2 + x3; \ + s5 = rotl64(x04 + x23 + x5, 40); \ + x56 = x5 + x6; \ + s6 = rotl64(x17 + x56 + x0, 50); \ + x26 = x23+x56; \ + s3 = rotl64(x26 + x7, 22); \ + s4 = rotl64(x26 + x1, 31); \ + s7 = rotl64(x26 + x4, 59); \ +} + +/* + * Second Orthogonal Latin Square + * 0 4 2 3 1 6 5 7 + * 7 6 3 2 5 4 1 0 + * 5 3 1 6 0 2 7 4 + * 1 0 5 4 3 7 2 6 + * 2 1 0 7 4 5 6 3 + * 3 5 7 0 6 1 4 2 + * 4 7 6 1 2 0 3 5 + * 6 2 4 5 7 3 0 1 + */ +#define LS2_256(c, y0, y1, y2, y3, y4, y5, y6, y7) \ +{ \ + uint32_t y01, y25, y34, y67, y04, y05, y27, y37; \ + y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \ + t0 = ~c + y05 + y7; \ + t2 = rotl32(y05 + y3, 9); \ + y34 = y3+y4, y04 = y01+y34; \ + t1 = rotl32(y04 + y6, 5); \ + t4 = rotl32(y04 + y5, 15); \ + y67 = y6+y7, y37 = y34+y67; \ + t3 = rotl32(y37 + y2, 11); \ + t7 = rotl32(y37 + y0, 27); \ + y27 = y25+y67; \ + t5 = rotl32(y27 + y4, 20); \ + t6 = rotl32(y27 + y1, 25); \ +} + +#define LS2_512(c, y0, y1, y2, y3, y4, y5, y6, y7) \ +{ \ + uint64_t y01, y25, y34, y67, y04, y05, y27, y37; \ + y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \ + t0 = ~c + y05 + y7; \ + t2 = rotl64(y05 + y3, 19); \ + y34 = y3+y4, y04 = y01+y34; \ + t1 = rotl64(y04 + y6, 10); \ + t4 = rotl64(y04 + y5, 36); \ + y67 = y6+y7, y37 = y34+y67; \ + t3 = rotl64(y37 + y2, 29); \ + t7 = rotl64(y37 + y0, 55); \ + y27 = y25+y67; \ + t5 = rotl64(y27 + y4, 44); \ + t6 = rotl64(y27 + y1, 48); \ +} + +#define quasi_exform256(r0, r1, r2, r3, r4, r5, r6, r7) \ +{ \ + uint32_t s04, s17, s23, s56, t01, t25, t34, t67; \ + s04 = s0 ^ s4, t01 = t0 ^ t1; \ + r0 = (s04 ^ s1) + (t01 ^ t5); \ + t67 = t6 ^ t7; \ + r1 = (s04 ^ s7) + (t2 ^ t67); \ + s23 = s2 ^ s3; \ + r7 = (s23 
^ s5) + (t4 ^ t67); \ + t34 = t3 ^ t4; \ + r3 = (s23 ^ s4) + (t0 ^ t34); \ + s56 = s5 ^ s6; \ + r5 = (s3 ^ s56) + (t34 ^ t6); \ + t25 = t2 ^ t5; \ + r6 = (s2 ^ s56) + (t25 ^ t7); \ + s17 = s1 ^ s7; \ + r4 = (s0 ^ s17) + (t1 ^ t25); \ + r2 = (s17 ^ s6) + (t01 ^ t3); \ +} + +#define quasi_exform512(r0, r1, r2, r3, r4, r5, r6, r7) \ +{ \ + uint64_t s04, s17, s23, s56, t01, t25, t34, t67; \ + s04 = s0 ^ s4, t01 = t0 ^ t1; \ + r0 = (s04 ^ s1) + (t01 ^ t5); \ + t67 = t6 ^ t7; \ + r1 = (s04 ^ s7) + (t2 ^ t67); \ + s23 = s2 ^ s3; \ + r7 = (s23 ^ s5) + (t4 ^ t67); \ + t34 = t3 ^ t4; \ + r3 = (s23 ^ s4) + (t0 ^ t34); \ + s56 = s5 ^ s6; \ + r5 = (s3 ^ s56) + (t34 ^ t6); \ + t25 = t2 ^ t5; \ + r6 = (s2 ^ s56) + (t25 ^ t7); \ + s17 = s1 ^ s7; \ + r4 = (s0 ^ s17) + (t1 ^ t25); \ + r2 = (s17 ^ s6) + (t01 ^ t3); \ +} + +static size_t +Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p) +{ + size_t bl; + + for (bl = bitlen; bl >= EdonR256_BLOCK_BITSIZE; + bl -= EdonR256_BLOCK_BITSIZE, data += 16) { + uint32_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, + t5, t6, t7; + uint32_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4, + q5, q6, q7; + const uint32_t defix = 0xaaaaaaaa; +#if defined(MACHINE_IS_BIG_ENDIAN) + uint32_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8, + swp9, swp10, swp11, swp12, swp13, swp14, swp15; +#define d(j) swp ## j +#define s32(j) ld_swap32((uint32_t *)data + j, swp ## j) +#else +#define d(j) data[j] +#endif + + /* First row of quasigroup e-transformations */ +#if defined(MACHINE_IS_BIG_ENDIAN) + s32(8); + s32(9); + s32(10); + s32(11); + s32(12); + s32(13); + s32(14); + s32(15); +#endif + LS1_256(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9), + d(8)); +#if defined(MACHINE_IS_BIG_ENDIAN) + s32(0); + s32(1); + s32(2); + s32(3); + s32(4); + s32(5); + s32(6); + s32(7); +#undef s32 +#endif + LS2_256(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7)); + quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_256(defix, 
p0, p1, p2, p3, p4, p5, p6, p7); + LS2_256(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14), + d(15)); + quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Second row of quasigroup e-transformations */ + LS1_256(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14], + p[15]); + LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7); + quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7); + LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7); + quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Third row of quasigroup e-transformations */ + LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7); + LS2_256(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]); + quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_256(defix, q0, q1, q2, q3, q4, q5, q6, q7); + LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7); + quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Fourth row of quasigroup e-transformations */ + LS1_256(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0)); + LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7); + quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7); + LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7); + quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Edon-R tweak on the original SHA-3 Edon-R submission. */ + p[0] ^= d(8) ^ p0; + p[1] ^= d(9) ^ p1; + p[2] ^= d(10) ^ p2; + p[3] ^= d(11) ^ p3; + p[4] ^= d(12) ^ p4; + p[5] ^= d(13) ^ p5; + p[6] ^= d(14) ^ p6; + p[7] ^= d(15) ^ p7; + p[8] ^= d(0) ^ q0; + p[9] ^= d(1) ^ q1; + p[10] ^= d(2) ^ q2; + p[11] ^= d(3) ^ q3; + p[12] ^= d(4) ^ q4; + p[13] ^= d(5) ^ q5; + p[14] ^= d(6) ^ q6; + p[15] ^= d(7) ^ q7; + } + +#undef d + return (bitlen - bl); +} + +/* + * Why is this #pragma here? + * + * Checksum functions like this one can go over the stack frame size check + * Linux imposes on 32-bit platforms (-Wframe-larger-than=1024). 
We can + * safely ignore the compiler error since we know that in ZoL, that + * the function will be called from a worker thread that won't be using + * much stack. The only function that goes over the 1k limit is Q512(), + * which only goes over it by a hair (1248 bytes on ARM32). + */ +#include <sys/isa_defs.h> /* for _ILP32 */ +#ifdef _ILP32 /* We're 32-bit, assume small stack frames */ +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +#if defined(__IBMC__) && defined(_AIX) && defined(__64BIT__) +static inline size_t +#else +static size_t +#endif +Q512(size_t bitlen, const uint64_t *data, uint64_t *restrict p) +{ + size_t bl; + + for (bl = bitlen; bl >= EdonR512_BLOCK_BITSIZE; + bl -= EdonR512_BLOCK_BITSIZE, data += 16) { + uint64_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, + t5, t6, t7; + uint64_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4, + q5, q6, q7; + const uint64_t defix = 0xaaaaaaaaaaaaaaaaull; +#if defined(MACHINE_IS_BIG_ENDIAN) + uint64_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8, + swp9, swp10, swp11, swp12, swp13, swp14, swp15; +#define d(j) swp##j +#define s64(j) ld_swap64((uint64_t *)data+j, swp##j) +#else +#define d(j) data[j] +#endif + + /* First row of quasigroup e-transformations */ +#if defined(MACHINE_IS_BIG_ENDIAN) + s64(8); + s64(9); + s64(10); + s64(11); + s64(12); + s64(13); + s64(14); + s64(15); +#endif + LS1_512(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9), + d(8)); +#if defined(MACHINE_IS_BIG_ENDIAN) + s64(0); + s64(1); + s64(2); + s64(3); + s64(4); + s64(5); + s64(6); + s64(7); +#undef s64 +#endif + LS2_512(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7)); + quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7); + LS2_512(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14), + d(15)); + quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Second row of quasigroup e-transformations */ + LS1_512(defix, p[8], p[9], p[10], p[11], 
p[12], p[13], p[14], + p[15]); + LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7); + quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7); + LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7); + quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Third row of quasigroup e-transformations */ + LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7); + LS2_512(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]); + quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_512(defix, q0, q1, q2, q3, q4, q5, q6, q7); + LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7); + quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Fourth row of quasigroup e-transformations */ + LS1_512(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0)); + LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7); + quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7); + LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7); + quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7); + + /* Edon-R tweak on the original SHA-3 Edon-R submission. 
*/ + p[0] ^= d(8) ^ p0; + p[1] ^= d(9) ^ p1; + p[2] ^= d(10) ^ p2; + p[3] ^= d(11) ^ p3; + p[4] ^= d(12) ^ p4; + p[5] ^= d(13) ^ p5; + p[6] ^= d(14) ^ p6; + p[7] ^= d(15) ^ p7; + p[8] ^= d(0) ^ q0; + p[9] ^= d(1) ^ q1; + p[10] ^= d(2) ^ q2; + p[11] ^= d(3) ^ q3; + p[12] ^= d(4) ^ q4; + p[13] ^= d(5) ^ q5; + p[14] ^= d(6) ^ q6; + p[15] ^= d(7) ^ q7; + } + +#undef d + return (bitlen - bl); +} + +/* + * Reset the bit counters of `state' and load the initial double chaining + * pipe that matches `hashbitlen'. Only 224, 256, 384 and 512 are handled; + * any other value trips the ASSERT and, where ASSERT is compiled out, + * leaves `state' untouched because the switch has no default case. + */ +void +EdonRInit(EdonRState *state, size_t hashbitlen) +{ + ASSERT(EDONR_VALID_HASHBITLEN(hashbitlen)); + switch (hashbitlen) { + case 224: + state->hashbitlen = 224; + state->bits_processed = 0; + state->unprocessed_bits = 0; + bcopy(i224p2, hashState224(state)->DoublePipe, + 16 * sizeof (uint32_t)); + break; + + case 256: + state->hashbitlen = 256; + state->bits_processed = 0; + state->unprocessed_bits = 0; + bcopy(i256p2, hashState256(state)->DoublePipe, + 16 * sizeof (uint32_t)); + break; + + case 384: + state->hashbitlen = 384; + state->bits_processed = 0; + state->unprocessed_bits = 0; + bcopy(i384p2, hashState384(state)->DoublePipe, + 16 * sizeof (uint64_t)); + break; + + case 512: + state->hashbitlen = 512; + state->bits_processed = 0; + state->unprocessed_bits = 0; + /* 64-bit IV goes through the p512 view, same as the 384 case */ + bcopy(i512p2, hashState512(state)->DoublePipe, + 16 * sizeof (uint64_t)); + break; + } +} + + +void +EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen) +{ + uint32_t *data32; + uint64_t *data64; + + size_t bits_processed; + + ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen)); + switch (state->hashbitlen) { + case 224: + case 256: + if (state->unprocessed_bits > 0) { + /* LastBytes = databitlen / 8 */ + int LastBytes = (int)databitlen >> 3; + + ASSERT(state->unprocessed_bits + databitlen <= + EdonR256_BLOCK_SIZE * 8); + + bcopy(data, hashState256(state)->LastPart + + (state->unprocessed_bits >> 3), LastBytes); + state->unprocessed_bits += (int)databitlen; + databitlen = state->unprocessed_bits; + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data32 = (uint32_t *)hashState256(state)->LastPart; + } 
else + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data32 = (uint32_t *)data; + + bits_processed = Q256(databitlen, data32, + hashState256(state)->DoublePipe); + state->bits_processed += bits_processed; + databitlen -= bits_processed; + state->unprocessed_bits = (int)databitlen; + if (databitlen > 0) { + /* LastBytes = Ceil(databitlen / 8) */ + int LastBytes = + ((~(((-(int)databitlen) >> 3) & 0x01ff)) + + 1) & 0x01ff; + + data32 += bits_processed >> 5; /* byte size update */ + bcopy(data32, hashState256(state)->LastPart, LastBytes); + } + break; + + case 384: + case 512: + if (state->unprocessed_bits > 0) { + /* LastBytes = databitlen / 8 */ + int LastBytes = (int)databitlen >> 3; + + ASSERT(state->unprocessed_bits + databitlen <= + EdonR512_BLOCK_SIZE * 8); + + bcopy(data, hashState512(state)->LastPart + + (state->unprocessed_bits >> 3), LastBytes); + state->unprocessed_bits += (int)databitlen; + databitlen = state->unprocessed_bits; + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)hashState512(state)->LastPart; + } else + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)data; + + bits_processed = Q512(databitlen, data64, + hashState512(state)->DoublePipe); + state->bits_processed += bits_processed; + databitlen -= bits_processed; + state->unprocessed_bits = (int)databitlen; + if (databitlen > 0) { + /* LastBytes = Ceil(databitlen / 8) */ + int LastBytes = + ((~(((-(int)databitlen) >> 3) & 0x03ff)) + + 1) & 0x03ff; + + data64 += bits_processed >> 6; /* byte size update */ + bcopy(data64, hashState512(state)->LastPart, LastBytes); + } + break; + } +} + +void +EdonRFinal(EdonRState *state, uint8_t *hashval) +{ + uint32_t *data32; + uint64_t *data64, num_bits; + + size_t databitlen; + int LastByte, PadOnePosition; + + num_bits = state->bits_processed + state->unprocessed_bits; + ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen)); + switch (state->hashbitlen) { + case 224: + case 256: + LastByte = (int)state->unprocessed_bits >> 3; + PadOnePosition = 7 - 
(state->unprocessed_bits & 0x07); + hashState256(state)->LastPart[LastByte] = + (hashState256(state)->LastPart[LastByte] + & (0xff << (PadOnePosition + 1))) ^ + (0x01 << PadOnePosition); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)hashState256(state)->LastPart; + + if (state->unprocessed_bits < 448) { + (void) memset((hashState256(state)->LastPart) + + LastByte + 1, 0x00, + EdonR256_BLOCK_SIZE - LastByte - 9); + databitlen = EdonR256_BLOCK_SIZE * 8; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 7); +#else + data64[7] = num_bits; +#endif + } else { + (void) memset((hashState256(state)->LastPart) + + LastByte + 1, 0x00, + EdonR256_BLOCK_SIZE * 2 - LastByte - 9); + databitlen = EdonR256_BLOCK_SIZE * 16; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 15); +#else + data64[15] = num_bits; +#endif + } + + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data32 = (uint32_t *)hashState256(state)->LastPart; + state->bits_processed += Q256(databitlen, data32, + hashState256(state)->DoublePipe); + break; + + case 384: + case 512: + LastByte = (int)state->unprocessed_bits >> 3; + PadOnePosition = 7 - (state->unprocessed_bits & 0x07); + hashState512(state)->LastPart[LastByte] = + (hashState512(state)->LastPart[LastByte] + & (0xff << (PadOnePosition + 1))) ^ + (0x01 << PadOnePosition); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)hashState512(state)->LastPart; + + if (state->unprocessed_bits < 960) { + (void) memset((hashState512(state)->LastPart) + + LastByte + 1, 0x00, + EdonR512_BLOCK_SIZE - LastByte - 9); + databitlen = EdonR512_BLOCK_SIZE * 8; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 15); +#else + data64[15] = num_bits; +#endif + } else { + (void) memset((hashState512(state)->LastPart) + + LastByte + 1, 0x00, + EdonR512_BLOCK_SIZE * 2 - LastByte - 9); + databitlen = EdonR512_BLOCK_SIZE * 16; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 31); +#else + data64[31] = num_bits; 
+#endif + } + + state->bits_processed += Q512(databitlen, data64, + hashState512(state)->DoublePipe); + break; + } + + switch (state->hashbitlen) { + case 224: { +#if defined(MACHINE_IS_BIG_ENDIAN) + uint32_t *d32 = (uint32_t *)hashval; + uint32_t *s32 = hashState224(state)->DoublePipe + 9; + int j; + + for (j = 0; j < EdonR224_DIGEST_SIZE >> 2; j++) + st_swap32(s32[j], d32 + j); +#else + bcopy(hashState256(state)->DoublePipe + 9, hashval, + EdonR224_DIGEST_SIZE); +#endif + break; + } + case 256: { +#if defined(MACHINE_IS_BIG_ENDIAN) + uint32_t *d32 = (uint32_t *)hashval; + uint32_t *s32 = hashState224(state)->DoublePipe + 8; + int j; + + for (j = 0; j < EdonR256_DIGEST_SIZE >> 2; j++) + st_swap32(s32[j], d32 + j); +#else + bcopy(hashState256(state)->DoublePipe + 8, hashval, + EdonR256_DIGEST_SIZE); +#endif + break; + } + case 384: { +#if defined(MACHINE_IS_BIG_ENDIAN) + uint64_t *d64 = (uint64_t *)hashval; + uint64_t *s64 = hashState384(state)->DoublePipe + 10; + int j; + + for (j = 0; j < EdonR384_DIGEST_SIZE >> 3; j++) + st_swap64(s64[j], d64 + j); +#else + bcopy(hashState384(state)->DoublePipe + 10, hashval, + EdonR384_DIGEST_SIZE); +#endif + break; + } + case 512: { +#if defined(MACHINE_IS_BIG_ENDIAN) + uint64_t *d64 = (uint64_t *)hashval; + uint64_t *s64 = hashState512(state)->DoublePipe + 8; + int j; + + for (j = 0; j < EdonR512_DIGEST_SIZE >> 3; j++) + st_swap64(s64[j], d64 + j); +#else + bcopy(hashState512(state)->DoublePipe + 8, hashval, + EdonR512_DIGEST_SIZE); +#endif + break; + } + } +} + + +void +EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen, + uint8_t *hashval) +{ + EdonRState state; + + EdonRInit(&state, hashbitlen); + EdonRUpdate(&state, data, databitlen); + EdonRFinal(&state, hashval); +} + +#ifdef _KERNEL +EXPORT_SYMBOL(EdonRInit); +EXPORT_SYMBOL(EdonRUpdate); +EXPORT_SYMBOL(EdonRHash); +EXPORT_SYMBOL(EdonRFinal); +#endif diff --git a/module/icp/algs/edonr/edonr_byteorder.h b/module/icp/algs/edonr/edonr_byteorder.h new file 
mode 100644 index 000000000..d17e8f1fd --- /dev/null +++ b/module/icp/algs/edonr/edonr_byteorder.h @@ -0,0 +1,216 @@ +/* + * IDI,NTNU + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright (C) 2009, 2010, Jorn Amundsen <[email protected]> + * + * C header file to determine compile machine byte order. Take care when cross + * compiling. 
+ * + * $Id: byteorder.h 517 2013-02-17 20:34:39Z joern $ + */ +/* + * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved + */ + +#ifndef _CRYPTO_EDONR_BYTEORDER_H +#define _CRYPTO_EDONR_BYTEORDER_H + + +#include <sys/param.h> + +#if defined(__BYTE_ORDER) +#if (__BYTE_ORDER == __BIG_ENDIAN) +#define MACHINE_IS_BIG_ENDIAN +#elif (__BYTE_ORDER == __LITTLE_ENDIAN) +#define MACHINE_IS_LITTLE_ENDIAN +#endif +#elif defined(BYTE_ORDER) +#if (BYTE_ORDER == BIG_ENDIAN) +#define MACHINE_IS_BIG_ENDIAN +#elif (BYTE_ORDER == LITTLE_ENDIAN) +#define MACHINE_IS_LITTLE_ENDIAN +#endif +#endif /* __BYTE_ORDER || BYTE_ORDER */ + +#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN) +#if defined(_BIG_ENDIAN) || defined(_MIPSEB) +#define MACHINE_IS_BIG_ENDIAN +#endif +#if defined(_LITTLE_ENDIAN) || defined(_MIPSEL) +#define MACHINE_IS_LITTLE_ENDIAN +#endif +#endif /* !MACHINE_IS_BIG_ENDIAN && !MACHINE_IS_LITTLE_ENDIAN */ + +#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN) +#error unknown machine byte sex +#endif + +#define BYTEORDER_INCLUDED + +#if defined(MACHINE_IS_BIG_ENDIAN) +/* + * Byte swapping macros for big endian architectures and compilers, + * add as appropriate for other architectures and/or compilers. 
+ * + * ld_swap64(src,dst) : uint64_t dst = *(src) + * st_swap64(src,dst) : *(dst) = uint64_t src + */ + +#if defined(__PPC__) || defined(_ARCH_PPC) + +#if defined(__64BIT__) +#if defined(_ARCH_PWR7) +#define aix_ld_swap64(s64, d64)\ + __asm__("ldbrx %0,0,%1" : "=r"(d64) : "r"(s64)) +#define aix_st_swap64(s64, d64)\ + __asm__ volatile("stdbrx %1,0,%0" : : "r"(d64), "r"(s64)) +#else +#define aix_ld_swap64(s64, d64) \ +{ \ + uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \ + \ + __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0;rldimi %1,%2,32,0"\ + : "+r"(s4), "=r"(d64), "=r"(h) : "b"(s64)); \ +} + +#define aix_st_swap64(s64, d64) \ +{ \ + uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \ + h = (s64) >> 32; \ + __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \ + : "+r"(s4) : "r"(s64), "r"(h), "b"(d64)); \ +} +#endif /* 64BIT && PWR7 */ +#else +#define aix_ld_swap64(s64, d64) \ +{ \ + uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\ + __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0" \ + : "+r"(s4), "=r"(l), "=r"(h) : "b"(s64)); \ + d64 = ((uint64_t)h<<32) | l; \ +} + +#define aix_st_swap64(s64, d64) \ +{ \ + uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\ + l = (s64) & 0xfffffffful, h = (s64) >> 32; \ + __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \ + : "+r"(s4) : "r"(l), "r"(h), "b"(d64)); \ +} +#endif /* __64BIT__ */ +#define aix_ld_swap32(s32, d32)\ + __asm__("lwbrx %0,0,%1" : "=r"(d32) : "r"(s32)) +#define aix_st_swap32(s32, d32)\ + __asm__ volatile("stwbrx %1,0,%0" : : "r"(d32), "r"(s32)) +#define ld_swap32(s, d) aix_ld_swap32(s, d) +#define st_swap32(s, d) aix_st_swap32(s, d) +#define ld_swap64(s, d) aix_ld_swap64(s, d) +#define st_swap64(s, d) aix_st_swap64(s, d) +#endif /* __PPC__ || _ARCH_PPC */ + +#if defined(__sparc) +#if !defined(__arch64__) && !defined(__sparcv8) && defined(__sparcv9) +#define __arch64__ +#endif +#if defined(__GNUC__) || (defined(__SUNPRO_C) 
&& __SUNPRO_C > 0x590) +/* need Sun Studio C 5.10 and above for GNU inline assembly */ +#if defined(__arch64__) +#define sparc_ld_swap64(s64, d64) \ + __asm__("ldxa [%1]0x88,%0" : "=r"(d64) : "r"(s64)) +#define sparc_st_swap64(s64, d64) \ + __asm__ volatile("stxa %0,[%1]0x88" : : "r"(s64), "r"(d64)) +#define st_swap64(s, d) sparc_st_swap64(s, d) +#else +#define sparc_ld_swap64(s64, d64) \ +{ \ + uint32_t *s4, h, l; \ + __asm__("add %3,4,%0\n\tlda [%3]0x88,%1\n\tlda [%0]0x88,%2" \ + : "+r"(s4), "=r"(l), "=r"(h) : "r"(s64)); \ + d64 = ((uint64_t)h<<32) | l; \ +} +#define sparc_st_swap64(s64, d64) \ +{ \ + uint32_t *s4, h, l; \ + l = (s64) & 0xfffffffful, h = (s64) >> 32; \ + __asm__ volatile("add %3,4,%0\n\tsta %1,[%3]0x88\n\tsta %2,[%0]0x88"\ + : "+r"(s4) : "r"(l), "r"(h), "r"(d64)); \ +} +#endif /* sparc64 */ +#define sparc_ld_swap32(s32, d32)\ + __asm__("lda [%1]0x88,%0" : "=r"(d32) : "r"(s32)) +#define sparc_st_swap32(s32, d32)\ + __asm__ volatile("sta %0,[%1]0x88" : : "r"(s32), "r"(d32)) +#define ld_swap32(s, d) sparc_ld_swap32(s, d) +#define st_swap32(s, d) sparc_st_swap32(s, d) +#define ld_swap64(s, d) sparc_ld_swap64(s, d) +#define st_swap64(s, d) sparc_st_swap64(s, d) +#endif /* GCC || Sun Studio C > 5.9 */ +#endif /* sparc */ + +/* GCC fallback */ +#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap32) +#define ld_swap32(s, d) (d = __builtin_bswap32(*(s))) +#define st_swap32(s, d) (*(d) = __builtin_bswap32(s)) +#endif /* GCC4/PGIC && !swap32 */ +#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap64) +#define ld_swap64(s, d) (d = __builtin_bswap64(*(s))) +#define st_swap64(s, d) (*(d) = __builtin_bswap64(s)) +#endif /* GCC4/PGIC && !swap64 */ + +/* generic fallback */ +#if !defined(ld_swap32) +#define ld_swap32(s, d) \ + (d = (*(s) >> 24) | (*(s) >> 8 & 0xff00) | \ + (*(s) << 8 & 0xff0000) | (*(s) << 24)) +#define st_swap32(s, d) \ + (*(d) = ((s) >> 24) | ((s) >> 8 & 0xff00) | \ + ((s) << 8 & 0xff0000) | ((s) << 24)) +#endif +#if 
!defined(ld_swap64) +#define ld_swap64(s, d) \ + (d = (*(s) >> 56) | (*(s) >> 40 & 0xff00) | \ + (*(s) >> 24 & 0xff0000) | (*(s) >> 8 & 0xff000000) | \ + (*(s) & 0xff000000) << 8 | (*(s) & 0xff0000) << 24 | \ + (*(s) & 0xff00) << 40 | *(s) << 56) +#define st_swap64(s, d) \ + (*(d) = ((s) >> 56) | ((s) >> 40 & 0xff00) | \ + ((s) >> 24 & 0xff0000) | ((s) >> 8 & 0xff000000) | \ + ((s) & 0xff000000) << 8 | ((s) & 0xff0000) << 24 | \ + ((s) & 0xff00) << 40 | (s) << 56) +#endif + +#endif /* MACHINE_IS_BIG_ENDIAN */ + + +#if defined(MACHINE_IS_LITTLE_ENDIAN) +/* replace swaps with simple assignments on little endian systems */ +#undef ld_swap32 +#undef st_swap32 +#define ld_swap32(s, d) (d = *(s)) +#define st_swap32(s, d) (*(d) = s) +#undef ld_swap64 +#undef st_swap64 +#define ld_swap64(s, d) (d = *(s)) +#define st_swap64(s, d) (*(d) = s) +#endif /* MACHINE_IS_LITTLE_ENDIAN */ + +#endif /* _CRYPTO_EDONR_BYTEORDER_H */ diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c index 792ca8825..dbe008190 100644 --- a/module/icp/algs/sha2/sha2.c +++ b/module/icp/algs/sha2/sha2.c @@ -38,7 +38,7 @@ #include <sys/zfs_context.h> #define _SHA2_IMPL -#include <sha2/sha2.h> +#include <sys/sha2.h> #include <sha2/sha2_consts.h> #define _RESTRICT_KYWD @@ -47,18 +47,37 @@ #include <sys/byteorder.h> #define HAVE_HTONL #endif +#include <sys/isa_defs.h> /* for _ILP32 */ static void Encode(uint8_t *, uint32_t *, size_t); +static void Encode64(uint8_t *, uint64_t *, size_t); #if defined(__amd64) +#define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1) #define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1) + +void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num); + #else static void SHA256Transform(SHA2_CTX *, const uint8_t *); +static void SHA512Transform(SHA2_CTX *, const uint8_t *); #endif /* __amd64 */ static uint8_t PADDING[128] = { 0x80, /* all zeros */ }; 
+/* + * The low-level checksum routines use a lot of stack space. On systems where + * small stacks are enforced (like 32-bit kernel builds), insert compiler memory + * barriers to reduce stack frame size. This can reduce the SHA512Transform() + * stack frame usage from 3k to <1k on ARM32, for example. + */ +#if defined(_ILP32) || defined(__powerpc) /* small stack */ +#define SMALL_STACK_MEMORY_BARRIER asm volatile("": : :"memory"); +#else +#define SMALL_STACK_MEMORY_BARRIER +#endif + /* Ch and Maj are the basic SHA2 functions. */ #define Ch(b, c, d) (((b) & (c)) ^ ((~b) & (d))) #define Maj(b, c, d) (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d))) @@ -82,6 +101,18 @@ static uint8_t PADDING[128] = { 0x80, /* all zeros */ }; T2 = BIGSIGMA0_256(a) + Maj(a, b, c); \ h = T1 + T2 +/* SHA384/512 Functions */ +#define BIGSIGMA0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39)) +#define BIGSIGMA1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41)) +#define SIGMA0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7)) +#define SIGMA1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6)) +#define SHA512ROUND(a, b, c, d, e, f, g, h, i, w) \ + T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w; \ + d += T1; \ + T2 = BIGSIGMA0(a) + Maj(a, b, c); \ + h = T1 + T2; \ + SMALL_STACK_MEMORY_BARRIER; + /* * sparc optimization: * @@ -130,6 +161,33 @@ SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk) uint32_t w8, w9, w10, w11, w12, w13, w14, w15; uint32_t T1, T2; +#if defined(__sparc) + static const uint32_t sha256_consts[] = { + SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2, + SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5, + SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8, + SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11, + SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14, + SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17, + SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20, + SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23, + SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26, + 
SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29, + SHA256_CONST_30, SHA256_CONST_31, SHA256_CONST_32, + SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35, + SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38, + SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41, + SHA256_CONST_42, SHA256_CONST_43, SHA256_CONST_44, + SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47, + SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50, + SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53, + SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56, + SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59, + SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62, + SHA256_CONST_63 + }; +#endif /* __sparc */ + if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */ bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32)); blk = (uint8_t *)ctx->buf_un.buf32; @@ -292,6 +350,256 @@ SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk) ctx->state.s32[6] += g; ctx->state.s32[7] += h; } + + +/* SHA384 and SHA512 Transform */ + +static void +SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk) +{ + + uint64_t a = ctx->state.s64[0]; + uint64_t b = ctx->state.s64[1]; + uint64_t c = ctx->state.s64[2]; + uint64_t d = ctx->state.s64[3]; + uint64_t e = ctx->state.s64[4]; + uint64_t f = ctx->state.s64[5]; + uint64_t g = ctx->state.s64[6]; + uint64_t h = ctx->state.s64[7]; + + uint64_t w0, w1, w2, w3, w4, w5, w6, w7; + uint64_t w8, w9, w10, w11, w12, w13, w14, w15; + uint64_t T1, T2; + +#if defined(__sparc) + static const uint64_t sha512_consts[] = { + SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2, + SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5, + SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8, + SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11, + SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14, + SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17, + SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20, + SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23, + SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26, + 
SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29, + SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32, + SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35, + SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38, + SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41, + SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44, + SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47, + SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50, + SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53, + SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56, + SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59, + SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62, + SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65, + SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68, + SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71, + SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74, + SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77, + SHA512_CONST_78, SHA512_CONST_79 + }; +#endif /* __sparc */ + + + if ((uintptr_t)blk & 0x7) { /* not 8-byte aligned? 
*/ + bcopy(blk, ctx->buf_un.buf64, sizeof (ctx->buf_un.buf64)); + blk = (uint8_t *)ctx->buf_un.buf64; + } + + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w0 = LOAD_BIG_64(blk + 8 * 0); + SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w1 = LOAD_BIG_64(blk + 8 * 1); + SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w2 = LOAD_BIG_64(blk + 8 * 2); + SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w3 = LOAD_BIG_64(blk + 8 * 3); + SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w4 = LOAD_BIG_64(blk + 8 * 4); + SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w5 = LOAD_BIG_64(blk + 8 * 5); + SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w6 = LOAD_BIG_64(blk + 8 * 6); + SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w7 = LOAD_BIG_64(blk + 8 * 7); + SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w8 = LOAD_BIG_64(blk + 8 * 8); + SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w9 = LOAD_BIG_64(blk + 8 * 9); + SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w10 = LOAD_BIG_64(blk + 8 * 10); + SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w11 = LOAD_BIG_64(blk + 8 * 11); + SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w12 = LOAD_BIG_64(blk + 8 * 12); + SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w13 = LOAD_BIG_64(blk + 8 * 13); + SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w14 = LOAD_BIG_64(blk + 8 * 14); + SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w15 = LOAD_BIG_64(blk + 8 * 15); + SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15); + + w0 = SIGMA1(w14) + w9 + 
SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0); + w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5); + w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7); + w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11); + w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15); + + w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0); + w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5); + w6 = SIGMA1(w4) + 
w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7); + w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11); + w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15); + + w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0); + w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5); + w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7); + w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11); + w12 = 
SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15); + + w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0); + w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5); + w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7); + w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11); + w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15); + + ctx->state.s64[0] += a; + ctx->state.s64[1] += b; + ctx->state.s64[2] += c; + ctx->state.s64[3] += d; + ctx->state.s64[4] += e; + ctx->state.s64[5] += f; + 
ctx->state.s64[6] += g; + ctx->state.s64[7] += h; + +} #endif /* !__amd64 */ @@ -311,14 +619,56 @@ Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input, { size_t i, j; - for (i = 0, j = 0; j < len; i++, j += 4) { - output[j] = (input[i] >> 24) & 0xff; - output[j + 1] = (input[i] >> 16) & 0xff; - output[j + 2] = (input[i] >> 8) & 0xff; - output[j + 3] = input[i] & 0xff; +#if defined(__sparc) + if (IS_P2ALIGNED(output, sizeof (uint32_t))) { + for (i = 0, j = 0; j < len; i++, j += 4) { + /* LINTED E_BAD_PTR_CAST_ALIGN */ + *((uint32_t *)(output + j)) = input[i]; + } + } else { +#endif /* little endian -- will work on big endian, but slowly */ + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (input[i] >> 24) & 0xff; + output[j + 1] = (input[i] >> 16) & 0xff; + output[j + 2] = (input[i] >> 8) & 0xff; + output[j + 3] = input[i] & 0xff; + } +#if defined(__sparc) } +#endif } +static void +Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input, + size_t len) +{ + size_t i, j; + +#if defined(__sparc) + if (IS_P2ALIGNED(output, sizeof (uint64_t))) { + for (i = 0, j = 0; j < len; i++, j += 8) { + /* LINTED E_BAD_PTR_CAST_ALIGN */ + *((uint64_t *)(output + j)) = input[i]; + } + } else { +#endif /* little endian -- will work on big endian, but slowly */ + for (i = 0, j = 0; j < len; i++, j += 8) { + + output[j] = (input[i] >> 56) & 0xff; + output[j + 1] = (input[i] >> 48) & 0xff; + output[j + 2] = (input[i] >> 40) & 0xff; + output[j + 3] = (input[i] >> 32) & 0xff; + output[j + 4] = (input[i] >> 24) & 0xff; + output[j + 5] = (input[i] >> 16) & 0xff; + output[j + 6] = (input[i] >> 8) & 0xff; + output[j + 7] = input[i] & 0xff; + } +#if defined(__sparc) + } +#endif +} + + void SHA2Init(uint64_t mech, SHA2_CTX *ctx) { @@ -336,22 +686,86 @@ SHA2Init(uint64_t mech, SHA2_CTX *ctx) ctx->state.s32[6] = 0x1f83d9abU; ctx->state.s32[7] = 0x5be0cd19U; break; + case SHA384_MECH_INFO_TYPE: + case SHA384_HMAC_MECH_INFO_TYPE: + case 
SHA384_HMAC_GEN_MECH_INFO_TYPE: + ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL; + ctx->state.s64[1] = 0x629a292a367cd507ULL; + ctx->state.s64[2] = 0x9159015a3070dd17ULL; + ctx->state.s64[3] = 0x152fecd8f70e5939ULL; + ctx->state.s64[4] = 0x67332667ffc00b31ULL; + ctx->state.s64[5] = 0x8eb44a8768581511ULL; + ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL; + ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL; + break; + case SHA512_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + ctx->state.s64[0] = 0x6a09e667f3bcc908ULL; + ctx->state.s64[1] = 0xbb67ae8584caa73bULL; + ctx->state.s64[2] = 0x3c6ef372fe94f82bULL; + ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL; + ctx->state.s64[4] = 0x510e527fade682d1ULL; + ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL; + ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL; + ctx->state.s64[7] = 0x5be0cd19137e2179ULL; + break; + case SHA512_224_MECH_INFO_TYPE: + ctx->state.s64[0] = 0x8C3D37C819544DA2ULL; + ctx->state.s64[1] = 0x73E1996689DCD4D6ULL; + ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL; + ctx->state.s64[3] = 0x679DD514582F9FCFULL; + ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL; + ctx->state.s64[5] = 0x77E36F7304C48942ULL; + ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL; + ctx->state.s64[7] = 0x1112E6AD91D692A1ULL; + break; + case SHA512_256_MECH_INFO_TYPE: + ctx->state.s64[0] = 0x22312194FC2BF72CULL; + ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL; + ctx->state.s64[2] = 0x2393B86B6F53B151ULL; + ctx->state.s64[3] = 0x963877195940EABDULL; + ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL; + ctx->state.s64[5] = 0xBE5E1E2553863992ULL; + ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL; + ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL; + break; +#ifdef _KERNEL default: cmn_err(CE_PANIC, "sha2_init: failed to find a supported algorithm: 0x%x", (uint32_t)mech); + +#endif /* _KERNEL */ } ctx->algotype = (uint32_t)mech; ctx->count.c64[0] = ctx->count.c64[1] = 0; } +#ifndef _KERNEL + +// #pragma inline(SHA256Init, SHA384Init, SHA512Init) void 
SHA256Init(SHA256_CTX *ctx) { SHA2Init(SHA256, ctx); } +void +SHA384Init(SHA384_CTX *ctx) +{ + SHA2Init(SHA384, ctx); +} + +void +SHA512Init(SHA512_CTX *ctx) +{ + SHA2Init(SHA512, ctx); +} + +#endif /* _KERNEL */ + /* * SHA2Update() * @@ -422,6 +836,8 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len) bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len); if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) SHA256Transform(ctx, ctx->buf_un.buf8); + else + SHA512Transform(ctx, ctx->buf_un.buf8); i = buf_len; } @@ -431,6 +847,10 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len) for (; i + buf_limit - 1 < input_len; i += buf_limit) { SHA256Transform(ctx, &input[i]); } + } else { + for (; i + buf_limit - 1 < input_len; i += buf_limit) { + SHA512Transform(ctx, &input[i]); + } } #else @@ -441,6 +861,13 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len) block_count); i += block_count << 6; } + } else { + block_count = (input_len - i) >> 7; + if (block_count > 0) { + SHA512TransformBlocks(ctx, &input[i], + block_count); + i += block_count << 7; + } } #endif /* !__amd64 */ @@ -479,6 +906,7 @@ void SHA2Final(void *digest, SHA2_CTX *ctx) { uint8_t bitcount_be[sizeof (ctx->count.c32)]; + uint8_t bitcount_be64[sizeof (ctx->count.c64)]; uint32_t index; uint32_t algotype = ctx->algotype; @@ -488,8 +916,45 @@ SHA2Final(void *digest, SHA2_CTX *ctx) SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index); SHA2Update(ctx, bitcount_be, sizeof (bitcount_be)); Encode(digest, ctx->state.s32, sizeof (ctx->state.s32)); + } else { + index = (ctx->count.c64[1] >> 3) & 0x7f; + Encode64(bitcount_be64, ctx->count.c64, + sizeof (bitcount_be64)); + SHA2Update(ctx, PADDING, ((index < 112) ? 
112 : 240) - index); + SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64)); + if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) { + ctx->state.s64[6] = ctx->state.s64[7] = 0; + Encode64(digest, ctx->state.s64, + sizeof (uint64_t) * 6); + } else if (algotype == SHA512_224_MECH_INFO_TYPE) { + uint8_t last[sizeof (uint64_t)]; + /* + * Since SHA-512/224 doesn't align well to 64-bit + * boundaries, we must do the encoding in three steps: + * 1) encode the three 64-bit words that fit neatly + * 2) encode the last 64-bit word to a temp buffer + * 3) take the first four bytes of the temp buffer (the upper + * 32 bits of that word) and append them to the digest + */ + Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3); + Encode64(last, &ctx->state.s64[3], sizeof (uint64_t)); + bcopy(last, (uint8_t *)digest + 24, 4); + } else if (algotype == SHA512_256_MECH_INFO_TYPE) { + Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4); + } else { + Encode64(digest, ctx->state.s64, + sizeof (ctx->state.s64)); + } } /* zeroize sensitive information */ bzero(ctx, sizeof (*ctx)); } + + + +#ifdef _KERNEL +EXPORT_SYMBOL(SHA2Init); +EXPORT_SYMBOL(SHA2Update); +EXPORT_SYMBOL(SHA2Final); +#endif diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE b/module/icp/algs/skein/THIRDPARTYLICENSE new file mode 100644 index 000000000..b7434fd17 --- /dev/null +++ b/module/icp/algs/skein/THIRDPARTYLICENSE @@ -0,0 +1,3 @@ +Implementation of the Skein hash function. +Source code author: Doug Whiting, 2008. +This algorithm and source code is released to the public domain. 
diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE.descrip b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip new file mode 100644 index 000000000..0ae89cfdf --- /dev/null +++ b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +LICENSE TERMS OF SKEIN HASH ALGORITHM IMPLEMENTATION diff --git a/module/icp/algs/skein/skein.c b/module/icp/algs/skein/skein.c new file mode 100644 index 000000000..0981eee08 --- /dev/null +++ b/module/icp/algs/skein/skein.c @@ -0,0 +1,921 @@ +/* + * Implementation of the Skein hash function. + * Source code author: Doug Whiting, 2008. + * This algorithm and source code is released to the public domain. + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ + +#define SKEIN_PORT_CODE /* instantiate any code in skein_port.h */ + +#include <sys/types.h> +#include <sys/note.h> +#include <sys/skein.h> /* get the Skein API definitions */ +#include "skein_impl.h" /* get internal definitions */ + +/* External function to process blkCnt (nonzero) full block(s) of data. 
*/ +void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); + +/* 256-bit Skein */ +/* init the context for a straight hashing operation */ +int +Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen) +{ + union { + uint8_t b[SKEIN_256_STATE_BYTES]; + uint64_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 256: + bcopy(SKEIN_256_IV_256, ctx->X, sizeof (ctx->X)); + break; + case 224: + bcopy(SKEIN_256_IV_224, ctx->X, sizeof (ctx->X)); + break; + case 160: + bcopy(SKEIN_256_IV_160, ctx->X, sizeof (ctx->X)); + break; + case 128: + bcopy(SKEIN_256_IV_128, ctx->X, sizeof (ctx->X)); + break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* + * build/process the config block, type == CONFIG (could be + * precomputed) + */ + /* set tweaks: T0=0; T1=CFG | FINAL */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + /* set the schema, version */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + /* zero pad config block */ + bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0])); + + /* compute the initial chaining values from config block */ + /* zero the chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } + /* + * The chaining vars ctx->X are now initialized for the given + * hashBitLen. 
+ * Set up to process the data message portion of the hash (default) + */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ + + return (SKEIN_SUCCESS); +} + +/* init the context for a MAC and/or tree hash operation */ +/* + * [identical to Skein_256_Init() when keyBytes == 0 && + * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] + */ +int +Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo, + const uint8_t *key, size_t keyBytes) +{ + union { + uint8_t b[SKEIN_256_STATE_BYTES]; + uint64_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) { /* is there a key? */ + /* no key: use all zeroes as key for config block */ + bzero(ctx->X, sizeof (ctx->X)); + } else { /* here to pre-process a key */ + + Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X)); + /* do a mini-Init right here */ + /* set output hash bit count = state size */ + ctx->h.hashBitLen = 8 * sizeof (ctx->X); + /* set tweaks: T0 = 0; T1 = KEY type */ + Skein_Start_New_Type(ctx, KEY); + /* zero the initial chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + /* hash the key */ + (void) Skein_256_Update(ctx, key, keyBytes); + /* put result into cfg.b[] */ + (void) Skein_256_Final_Pad(ctx, cfg.b); + /* copy over into ctx->X[] */ + bcopy(cfg.b, ctx->X, sizeof (cfg.b)); +#if SKEIN_NEED_SWAP + { + uint_t i; + /* convert key bytes to context words */ + for (i = 0; i < SKEIN_256_STATE_WORDS; i++) + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* + * build/process the config block, type == CONFIG (could be + * precomputed for each key) + */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = 
Skein_Swap64(hashBitLen); /* hash result length in bits */ + /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + cfg.w[2] = Skein_Swap64(treeInfo); + + Skein_Show_Key(256, &ctx->h, key, keyBytes); + + /* compute the initial chaining values from config block */ + Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx, MSG); + + return (SKEIN_SUCCESS); +} + +/* process the input bytes */ +int +Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt) +{ + size_t n; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) { + /* finish up any buffered message data */ + if (ctx->h.bCnt) { + /* # bytes free in buffer b[] */ + n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; + if (n) { + /* check on our logic here */ + Skein_assert(n < msgByteCnt); + bcopy(msg, &ctx->b[ctx->h.bCnt], n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES); + Skein_256_Process_Block(ctx, ctx->b, 1, + SKEIN_256_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* + * now process any remaining full blocks, directly from input + * message data + */ + if (msgByteCnt > SKEIN_256_BLOCK_BYTES) { + /* number of full blocks to process */ + n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES; + Skein_256_Process_Block(ctx, msg, n, + SKEIN_256_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_256_BLOCK_BYTES; + msg += n * SKEIN_256_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES); + bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt); + ctx->h.bCnt += 
msgByteCnt; + } + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the result */ +int +Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_256_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + + /* process the final block */ + Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_256_BLOCK_BYTES; + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256, &ctx->h, n, + hashVal + i * SKEIN_256_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* 512-bit Skein */ + +/* init the context for a straight hashing operation */ +int +Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen) +{ + union { + uint8_t b[SKEIN_512_STATE_BYTES]; + uint64_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + 
Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: + bcopy(SKEIN_512_IV_512, ctx->X, sizeof (ctx->X)); + break; + case 384: + bcopy(SKEIN_512_IV_384, ctx->X, sizeof (ctx->X)); + break; + case 256: + bcopy(SKEIN_512_IV_256, ctx->X, sizeof (ctx->X)); + break; + case 224: + bcopy(SKEIN_512_IV_224, ctx->X, sizeof (ctx->X)); + break; +#endif + default: + /* + * here if there is no precomputed IV value available + * build/process the config block, type == CONFIG (could be + * precomputed) + */ + /* set tweaks: T0=0; T1=CFG | FINAL */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + /* set the schema, version */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + /* zero pad config block */ + bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0])); + + /* compute the initial chaining values from config block */ + /* zero the chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } + + /* + * The chaining vars ctx->X are now initialized for the given + * hashBitLen. 
Set up to process the data message portion of the + * hash (default) + */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ + + return (SKEIN_SUCCESS); +} + +/* init the context for a MAC and/or tree hash operation */ +/* + * [identical to Skein_512_Init() when keyBytes == 0 && + * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] + */ +int +Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo, + const uint8_t *key, size_t keyBytes) +{ + union { + uint8_t b[SKEIN_512_STATE_BYTES]; + uint64_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) { /* is there a key? */ + /* no key: use all zeroes as key for config block */ + bzero(ctx->X, sizeof (ctx->X)); + } else { /* here to pre-process a key */ + + Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X)); + /* do a mini-Init right here */ + /* set output hash bit count = state size */ + ctx->h.hashBitLen = 8 * sizeof (ctx->X); + /* set tweaks: T0 = 0; T1 = KEY type */ + Skein_Start_New_Type(ctx, KEY); + /* zero the initial chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + (void) Skein_512_Update(ctx, key, keyBytes); /* hash the key */ + /* put result into cfg.b[] */ + (void) Skein_512_Final_Pad(ctx, cfg.b); + /* copy over into ctx->X[] */ + bcopy(cfg.b, ctx->X, sizeof (cfg.b)); +#if SKEIN_NEED_SWAP + { + uint_t i; + /* convert key bytes to context words */ + for (i = 0; i < SKEIN_512_STATE_WORDS; i++) + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* + * build/process the config block, type == CONFIG (could be + * precomputed for each key) + */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = 
Skein_Swap64(hashBitLen); /* hash result length in bits */ + /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + cfg.w[2] = Skein_Swap64(treeInfo); + + Skein_Show_Key(512, &ctx->h, key, keyBytes); + + /* compute the initial chaining values from config block */ + Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx, MSG); + + return (SKEIN_SUCCESS); +} + +/* process the input bytes */ +int +Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt) +{ + size_t n; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { + /* finish up any buffered message data */ + if (ctx->h.bCnt) { + /* # bytes free in buffer b[] */ + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + if (n) { + /* check on our logic here */ + Skein_assert(n < msgByteCnt); + bcopy(msg, &ctx->b[ctx->h.bCnt], n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx, ctx->b, 1, + SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* + * now process any remaining full blocks, directly from input + * message data + */ + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) { + /* number of full blocks to process */ + n = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES; + Skein_512_Process_Block(ctx, msg, n, + SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); + bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt); + ctx->h.bCnt += 
msgByteCnt; + } + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the result */ +int +Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_512_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + + /* process the final block */ + Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512, &ctx->h, n, + hashVal + i * SKEIN_512_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* 1024-bit Skein */ + +/* init the context for a straight hashing operation */ +int +Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen) +{ + union { + uint8_t b[SKEIN1024_STATE_BYTES]; + uint64_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + 
Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: + bcopy(SKEIN1024_IV_512, ctx->X, sizeof (ctx->X)); + break; + case 384: + bcopy(SKEIN1024_IV_384, ctx->X, sizeof (ctx->X)); + break; + case 1024: + bcopy(SKEIN1024_IV_1024, ctx->X, sizeof (ctx->X)); + break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* + * build/process the config block, type == CONFIG (could be + * precomputed) + */ + /* set tweaks: T0=0; T1=CFG | FINAL */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + /* set the schema, version */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + /* zero pad config block */ + bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0])); + + /* compute the initial chaining values from config block */ + /* zero the chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } + + /* + * The chaining vars ctx->X are now initialized for the given + * hashBitLen. 
Set up to process the data message portion of the hash + * (default) + */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ + + return (SKEIN_SUCCESS); +} + +/* init the context for a MAC and/or tree hash operation */ +/* + * [identical to Skein1024_Init() when keyBytes == 0 && + * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] + */ +int +Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo, + const uint8_t *key, size_t keyBytes) +{ + union { + uint8_t b[SKEIN1024_STATE_BYTES]; + uint64_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) { /* is there a key? */ + /* no key: use all zeroes as key for config block */ + bzero(ctx->X, sizeof (ctx->X)); + } else { /* here to pre-process a key */ + Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X)); + /* do a mini-Init right here */ + /* set output hash bit count = state size */ + ctx->h.hashBitLen = 8 * sizeof (ctx->X); + /* set tweaks: T0 = 0; T1 = KEY type */ + Skein_Start_New_Type(ctx, KEY); + /* zero the initial chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + (void) Skein1024_Update(ctx, key, keyBytes); /* hash the key */ + /* put result into cfg.b[] */ + (void) Skein1024_Final_Pad(ctx, cfg.b); + /* copy over into ctx->X[] */ + bcopy(cfg.b, ctx->X, sizeof (cfg.b)); +#if SKEIN_NEED_SWAP + { + uint_t i; + /* convert key bytes to context words */ + for (i = 0; i < SKEIN1024_STATE_WORDS; i++) + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* + * build/process the config block, type == CONFIG (could be + * precomputed for each key) + */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length 
in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + cfg.w[2] = Skein_Swap64(treeInfo); + + Skein_Show_Key(1024, &ctx->h, key, keyBytes); + + /* compute the initial chaining values from config block */ + Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx, MSG); + + return (SKEIN_SUCCESS); +} + +/* process the input bytes */ +int +Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt) +{ + size_t n; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) { + /* finish up any buffered message data */ + if (ctx->h.bCnt) { + /* # bytes free in buffer b[] */ + n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; + if (n) { + /* check on our logic here */ + Skein_assert(n < msgByteCnt); + bcopy(msg, &ctx->b[ctx->h.bCnt], n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES); + Skein1024_Process_Block(ctx, ctx->b, 1, + SKEIN1024_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* + * now process any remaining full blocks, directly from + * input message data + */ + if (msgByteCnt > SKEIN1024_BLOCK_BYTES) { + /* number of full blocks to process */ + n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES; + Skein1024_Process_Block(ctx, msg, n, + SKEIN1024_BLOCK_BYTES); + msgByteCnt -= n * SKEIN1024_BLOCK_BYTES; + msg += n * SKEIN1024_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES); + bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt); + ctx->h.bCnt += 
msgByteCnt; + } + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the result */ +int +Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN1024_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + + /* process the final block */ + Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN1024_BLOCK_BYTES; + if (n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024, &ctx->h, n, + hashVal + i * SKEIN1024_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* Functions to support MAC/tree hashing */ +/* (this code is identical for Optimized and Reference versions) */ + +/* finalize the hash computation and output the block, no OUTPUT stage */ +int +Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal) +{ + /* 
catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + /* process the final block */ + Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* "output" the state bytes */ + Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_256_BLOCK_BYTES); + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the block, no OUTPUT stage */ +int +Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal) +{ + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + /* process the final block */ + Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* "output" the state bytes */ + Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_512_BLOCK_BYTES); + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the block, no OUTPUT stage */ +int +Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal) +{ + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); + + /* tag as the final block */ + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + /* process the final block */ + Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* "output" the state bytes */ + Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN1024_BLOCK_BYTES); + + return (SKEIN_SUCCESS); +} + +#if SKEIN_TREE_HASH +/* just do the OUTPUT stage */ +int +Skein_256_Output(Skein_256_Ctxt_t 
*ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_256_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_256_BLOCK_BYTES; + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256, &ctx->h, n, + hashVal + i * SKEIN_256_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* just do the OUTPUT stage */ +int +Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_512_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = 
Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512, &ctx->h, n, + hashVal + i * SKEIN_512_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* just do the OUTPUT stage; ctx->b is clobbered, ctx->X is restored */ +int +Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN1024_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN1024_BLOCK_BYTES; + if (n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024, &ctx->h, n, + hashVal + i * SKEIN1024_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} +#endif
+ +#ifdef _KERNEL +EXPORT_SYMBOL(Skein_512_Init); +EXPORT_SYMBOL(Skein_512_InitExt); +EXPORT_SYMBOL(Skein_512_Update); +EXPORT_SYMBOL(Skein_512_Final); +#endif diff --git a/module/icp/algs/skein/skein_block.c b/module/icp/algs/skein/skein_block.c new file mode 100644 index 000000000..d2e811963 --- /dev/null +++ b/module/icp/algs/skein/skein_block.c @@ -0,0 +1,793 @@ +/* + * Implementation of the Skein block functions. + * Source code author: Doug Whiting, 2008. + * This algorithm and source code is released to the public domain. + * Compile-time switches: + * SKEIN_USE_ASM -- set bits (256/512/1024) to select which + * versions use ASM code for block processing + * [default: use C for all block sizes] + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ + +#include <sys/skein.h> +#include "skein_impl.h" +#include <sys/isa_defs.h> /* for _ILP32 */ + +#ifndef SKEIN_USE_ASM +#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ +#endif + +#ifndef SKEIN_LOOP +/* + * The low-level checksum routines use a lot of stack space. On systems where + * small stack frames are enforced (like 32-bit kernel builds), do not unroll + * checksum calculations to save stack space. + * + * Even with no loops unrolled, we still can exceed the 1k stack frame limit + * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can + * safely ignore it though, since the checksum functions will be called + * from a worker thread that won't be using much stack. That's why we have + * the #pragma here to ignore the warning. + */ +#if defined(_ILP32) || defined(__powerpc) /* Assume small stack */ +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +/* + * We're running on 32-bit, don't unroll loops to save stack frame space + * + * Due to the way the calculations on SKEIN_LOOP are done in + * Skein_*_Process_Block(), a value of 111 disables unrolling loops + * in any of those functions.
+ */ +#define SKEIN_LOOP 111 +#else +/* We're compiling with large stacks */ +#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ +#endif +#endif + +/* some useful definitions for code here */ +#define BLK_BITS (WCNT*64) +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +/* no debugging in Illumos version */ +#define DebugSaveTweak(ctx) + +/* Skein_256 */ +#if !(SKEIN_USE_ASM & 256) + +void +Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) +{ /* do it in C */ + enum { + WCNT = SKEIN_256_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10) +#else +#define SKEIN_UNROLL_256 (0) +#endif + +#if SKEIN_UNROLL_256 +#if (RCNT % SKEIN_UNROLL_256) +#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ +#endif + size_t r; + /* key schedule words : chaining vars + tweak + "rotation" */ + uint64_t kw[WCNT + 4 + RCNT * 2]; +#else + uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ +#endif + /* local copy of context vars, for speed */ + uint64_t X0, X1, X2, X3; + uint64_t w[WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + /* use for debugging (help compiler put Xn in registers) */ + const uint64_t *Xptr[4]; + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; +#endif + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* + * this implementation only supports 2**64 input bytes + * (no carry out here) + */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1] + ts[0]; + X2 = w[2] + ks[2] + ts[1]; + X3 = w[3] + ks[3]; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, + Xptr); /* show starting state values */ + + blkPtr += SKEIN_256_BLOCK_BYTES; + + /* run the rounds */ + +#define Round256(p0, p1, p2, p3, ROT, rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \ + +#if SKEIN_UNROLL_256 == 0 +#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I256(R) \ + X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ + X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ + X3 += ks[((R) + 4) % 5] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R256(p0, p1, p2, p3, ROT, rNum) \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I256(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \ + X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \ + X3 += ks[r + (R) + 3] + r + (R); \ + ks[r + (R) + 4] = ks[r + (R) - 1]; /* 
rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R) - 1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + /* loop thru it */ + for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) +#endif + { +#define R256_8_rounds(R) \ + R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \ + R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \ + R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \ + R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \ + I256(2 * (R)); \ + R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \ + R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \ + R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \ + R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \ + I256(2 * (R) + 1); + + R256_8_rounds(0); + +#define R256_Unroll_R(NN) \ + ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \ + (SKEIN_UNROLL_256 > (NN))) + +#if R256_Unroll_R(1) + R256_8_rounds(1); +#endif +#if R256_Unroll_R(2) + R256_8_rounds(2); +#endif +#if R256_Unroll_R(3) + R256_8_rounds(3); +#endif +#if R256_Unroll_R(4) + R256_8_rounds(4); +#endif +#if R256_Unroll_R(5) + R256_8_rounds(5); +#endif +#if R256_Unroll_R(6) + R256_8_rounds(6); +#endif +#if R256_Unroll_R(7) + R256_8_rounds(7); +#endif +#if R256_Unroll_R(8) + R256_8_rounds(8); +#endif +#if R256_Unroll_R(9) + R256_8_rounds(9); +#endif +#if R256_Unroll_R(10) + R256_8_rounds(10); +#endif +#if R256_Unroll_R(11) + R256_8_rounds(11); +#endif +#if R256_Unroll_R(12) + R256_8_rounds(12); +#endif +#if R256_Unroll_R(13) + R256_8_rounds(13); +#endif +#if R256_Unroll_R(14) + R256_8_rounds(14); +#endif +#if (SKEIN_UNROLL_256 > 14) +#error "need more unrolling in Skein_256_Process_Block" +#endif + } + /* + * do the final "feedforward" xor, update context chaining vars + */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +#if defined(SKEIN_CODE_SIZE) || 
defined(SKEIN_PERF) +size_t +Skein_256_Process_Block_CodeSize(void) +{ + return ((uint8_t *)Skein_256_Process_Block_CodeSize) - + ((uint8_t *)Skein_256_Process_Block); +} + +uint_t +Skein_256_Unroll_Cnt(void) +{ + return (SKEIN_UNROLL_256); +} +#endif +#endif + +/* Skein_512 */ +#if !(SKEIN_USE_ASM & 512) +void +Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) +{ /* do it in C */ + enum { + WCNT = SKEIN_512_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10) +#else +#define SKEIN_UNROLL_512 (0) +#endif + +#if SKEIN_UNROLL_512 +#if (RCNT % SKEIN_UNROLL_512) +#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ +#endif + size_t r; + /* key schedule words : chaining vars + tweak + "rotation" */ + uint64_t kw[WCNT + 4 + RCNT * 2]; +#else + uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ +#endif + /* local copy of vars, for speed */ + uint64_t X0, X1, X2, X3, X4, X5, X6, X7; + uint64_t w[WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + /* use for debugging (help compiler put Xn in registers) */ + const uint64_t *Xptr[8]; + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; + Xptr[4] = &X4; + Xptr[5] = &X5; + Xptr[6] = &X6; + Xptr[7] = &X7; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* + * this implementation only supports 2**64 input bytes + * (no carry out here) + */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, + Xptr); + /* run the rounds */ +#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ + X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ + X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ + X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6; + +#if SKEIN_UNROLL_512 == 0 +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I512(R) \ + X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\ + X1 += ks[((R) + 2) % 9]; \ + X2 += ks[((R) + 3) % 9]; \ + X3 += ks[((R) + 4) % 9]; \ + X4 += ks[((R) + 5) % 9]; \ + X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ + X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ + X7 += ks[((R) + 8) % 9] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 
SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I512(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1]; \ + X2 += ks[r + (R) + 2]; \ + X3 += ks[r + (R) + 3]; \ + X4 += ks[r + (R) + 4]; \ + X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \ + X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \ + X7 += ks[r + (R) + 7] + r + (R); \ + ks[r + (R)+8] = ks[r + (R) - 1]; /* rotate key schedule */\ + ts[r + (R)+2] = ts[r + (R) - 1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + /* loop thru it */ + for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) +#endif /* end of looped code definitions */ + { +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ + I512(2 * (R)); \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ + I512(2*(R) + 1); /* and key injection */ + + R512_8_rounds(0); + +#define R512_Unroll_R(NN) \ + ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \ + (SKEIN_UNROLL_512 > (NN))) + +#if R512_Unroll_R(1) + R512_8_rounds(1); +#endif +#if R512_Unroll_R(2) + R512_8_rounds(2); +#endif +#if R512_Unroll_R(3) + R512_8_rounds(3); +#endif +#if R512_Unroll_R(4) + R512_8_rounds(4); +#endif +#if R512_Unroll_R(5) + R512_8_rounds(5); +#endif +#if R512_Unroll_R(6) + R512_8_rounds(6); +#endif +#if R512_Unroll_R(7) + R512_8_rounds(7); +#endif +#if R512_Unroll_R(8) + R512_8_rounds(8); +#endif +#if R512_Unroll_R(9) + 
R512_8_rounds(9); +#endif +#if R512_Unroll_R(10) + R512_8_rounds(10); +#endif +#if R512_Unroll_R(11) + R512_8_rounds(11); +#endif +#if R512_Unroll_R(12) + R512_8_rounds(12); +#endif +#if R512_Unroll_R(13) + R512_8_rounds(13); +#endif +#if R512_Unroll_R(14) + R512_8_rounds(14); +#endif +#if (SKEIN_UNROLL_512 > 14) +#error "need more unrolling in Skein_512_Process_Block" +#endif + } + + /* + * do the final "feedforward" xor, update context chaining vars + */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } + while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t +Skein_512_Process_Block_CodeSize(void) +{ + return ((uint8_t *)Skein_512_Process_Block_CodeSize) - + ((uint8_t *)Skein_512_Process_Block); +} + +uint_t +Skein_512_Unroll_Cnt(void) +{ + return (SKEIN_UNROLL_512); +} +#endif +#endif + +/* Skein1024 */ +#if !(SKEIN_USE_ASM & 1024) +void +Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) +{ + /* do it in C, always looping (unrolled is bigger AND slower!) 
*/ + enum { + WCNT = SKEIN1024_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN1024_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) +#else +#define SKEIN_UNROLL_1024 (0) +#endif + +#if (SKEIN_UNROLL_1024 != 0) +#if (RCNT % SKEIN_UNROLL_1024) +#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ +#endif + size_t r; + /* key schedule words : chaining vars + tweak + "rotation" */ + uint64_t kw[WCNT + 4 + RCNT * 2]; +#else + uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ +#endif + + /* local copy of vars, for speed */ + uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11, + X12, X13, X14, X15; + uint64_t w[WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + /* use for debugging (help compiler put Xn in registers) */ + const uint64_t *Xptr[16]; + Xptr[0] = &X00; + Xptr[1] = &X01; + Xptr[2] = &X02; + Xptr[3] = &X03; + Xptr[4] = &X04; + Xptr[5] = &X05; + Xptr[6] = &X06; + Xptr[7] = &X07; + Xptr[8] = &X08; + Xptr[9] = &X09; + Xptr[10] = &X10; + Xptr[11] = &X11; + Xptr[12] = &X12; + Xptr[13] = &X13; + Xptr[14] = &X14; + Xptr[15] = &X15; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* + * this implementation only supports 2**64 input bytes + * (no carry out here) + */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ctx->X[8]; + ks[9] = ctx->X[9]; + ks[10] = ctx->X[10]; + ks[11] = ctx->X[11]; + ks[12] = ctx->X[12]; + ks[13] = ctx->X[13]; + ks[14] = ctx->X[14]; + ks[15] = ctx->X[15]; + ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ + ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^ + ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X00 = w[0] + ks[0]; /* do the first full key injection */ + X01 = w[1] + ks[1]; + X02 = w[2] + ks[2]; + X03 = w[3] + ks[3]; + X04 = w[4] + ks[4]; + X05 = w[5] + ks[5]; + X06 = w[6] + ks[6]; + X07 = w[7] + ks[7]; + X08 = w[8] + ks[8]; + X09 = w[9] + ks[9]; + X10 = w[10] + ks[10]; + X11 = w[11] + ks[11]; + X12 = w[12] + ks[12]; + X13 = w[13] + ks[13] + ts[0]; + X14 = w[14] + ks[14] + ts[1]; + X15 = w[15] + ks[15]; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, + Xptr); + +#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ + pD, pE, pF, ROT, rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ + X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ + X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ + X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\ + X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\ + X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\ + X##pC += X##pD; 
X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\ + X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE; + +#if SKEIN_UNROLL_1024 == 0 +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ + pE, pF, ROT, rn) \ + Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ + pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr); + +#define I1024(R) \ + X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\ + X01 += ks[((R) + 2) % 17]; \ + X02 += ks[((R) + 3) % 17]; \ + X03 += ks[((R) + 4) % 17]; \ + X04 += ks[((R) + 5) % 17]; \ + X05 += ks[((R) + 6) % 17]; \ + X06 += ks[((R) + 7) % 17]; \ + X07 += ks[((R) + 8) % 17]; \ + X08 += ks[((R) + 9) % 17]; \ + X09 += ks[((R) + 10) % 17]; \ + X10 += ks[((R) + 11) % 17]; \ + X11 += ks[((R) + 12) % 17]; \ + X12 += ks[((R) + 13) % 17]; \ + X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \ + X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \ + X15 += ks[((R) + 16) % 17] + (R) +1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ + pE, pF, ROT, rn) \ + Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ + pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr); + +#define I1024(R) \ + X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X01 += ks[r + (R) + 1]; \ + X02 += ks[r + (R) + 2]; \ + X03 += ks[r + (R) + 3]; \ + X04 += ks[r + (R) + 4]; \ + X05 += ks[r + (R) + 5]; \ + X06 += ks[r + (R) + 6]; \ + X07 += ks[r + (R) + 7]; \ + X08 += ks[r + (R) + 8]; \ + X09 += ks[r + (R) + 9]; \ + X10 += ks[r + (R) + 10]; \ + X11 += ks[r + (R) + 11]; \ + X12 += ks[r + (R) + 12]; \ + X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \ + X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \ + X15 += ks[r + (R) + 15] + r + (R); \ + ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\ + ts[r + (R) + 2] = ts[r + (R) - 1]; \ + 
Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + /* loop thru it */ + for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) +#endif + { +#define R1024_8_rounds(R) /* do 8 full rounds */ \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ + 14, 15, R1024_0, 8 * (R) + 1); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ + 08, 01, R1024_1, 8 * (R) + 2); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ + 10, 09, R1024_2, 8 * (R) + 3); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ + 12, 07, R1024_3, 8 * (R) + 4); \ + I1024(2 * (R)); \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ + 14, 15, R1024_4, 8 * (R) + 5); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ + 08, 01, R1024_5, 8 * (R) + 6); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ + 10, 09, R1024_6, 8 * (R) + 7); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ + 12, 07, R1024_7, 8 * (R) + 8); \ + I1024(2 * (R) + 1); + + R1024_8_rounds(0); + +#define R1024_Unroll_R(NN) \ + ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \ + (SKEIN_UNROLL_1024 > (NN))) + +#if R1024_Unroll_R(1) + R1024_8_rounds(1); +#endif +#if R1024_Unroll_R(2) + R1024_8_rounds(2); +#endif +#if R1024_Unroll_R(3) + R1024_8_rounds(3); +#endif +#if R1024_Unroll_R(4) + R1024_8_rounds(4); +#endif +#if R1024_Unroll_R(5) + R1024_8_rounds(5); +#endif +#if R1024_Unroll_R(6) + R1024_8_rounds(6); +#endif +#if R1024_Unroll_R(7) + R1024_8_rounds(7); +#endif +#if R1024_Unroll_R(8) + R1024_8_rounds(8); +#endif +#if R1024_Unroll_R(9) + R1024_8_rounds(9); +#endif +#if R1024_Unroll_R(10) + R1024_8_rounds(10); +#endif +#if R1024_Unroll_R(11) + R1024_8_rounds(11); +#endif +#if R1024_Unroll_R(12) + R1024_8_rounds(12); +#endif +#if R1024_Unroll_R(13) + R1024_8_rounds(13); +#endif +#if R1024_Unroll_R(14) + R1024_8_rounds(14); +#endif +#if (SKEIN_UNROLL_1024 > 14) +#error "need 
more unrolling in Skein_1024_Process_Block" +#endif + } + /* + * do the final "feedforward" xor, update context chaining vars + */ + + ctx->X[0] = X00 ^ w[0]; + ctx->X[1] = X01 ^ w[1]; + ctx->X[2] = X02 ^ w[2]; + ctx->X[3] = X03 ^ w[3]; + ctx->X[4] = X04 ^ w[4]; + ctx->X[5] = X05 ^ w[5]; + ctx->X[6] = X06 ^ w[6]; + ctx->X[7] = X07 ^ w[7]; + ctx->X[8] = X08 ^ w[8]; + ctx->X[9] = X09 ^ w[9]; + ctx->X[10] = X10 ^ w[10]; + ctx->X[11] = X11 ^ w[11]; + ctx->X[12] = X12 ^ w[12]; + ctx->X[13] = X13 ^ w[13]; + ctx->X[14] = X14 ^ w[14]; + ctx->X[15] = X15 ^ w[15]; + + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + blkPtr += SKEIN1024_BLOCK_BYTES; + } while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t +Skein1024_Process_Block_CodeSize(void) +{ + return ((uint8_t *)Skein1024_Process_Block_CodeSize) - + ((uint8_t *)Skein1024_Process_Block); +} + +uint_t +Skein1024_Unroll_Cnt(void) +{ + return (SKEIN_UNROLL_1024); +} +#endif +#endif diff --git a/module/icp/algs/skein/skein_impl.h b/module/icp/algs/skein/skein_impl.h new file mode 100644 index 000000000..e83a06971 --- /dev/null +++ b/module/icp/algs/skein/skein_impl.h @@ -0,0 +1,289 @@ +/* + * Internal definitions for Skein hashing. + * Source code author: Doug Whiting, 2008. + * This algorithm and source code is released to the public domain. + * + * The following compile-time switches may be defined to control some + * tradeoffs between speed, code size, error checking, and security. + * + * The "default" note explains what happens when the switch is not defined. + * + * SKEIN_DEBUG -- make callouts from inside Skein code + * to examine/display intermediate values. + * [default: no callouts (no overhead)] + * + * SKEIN_ERR_CHECK -- how error checking is handled inside Skein + * code. If not defined, most error checking + * is disabled (for performance). 
Otherwise, + * the switch value is interpreted as: + * 0: use assert() to flag errors + * 1: return SKEIN_FAIL to flag errors + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ + +#ifndef _SKEIN_IMPL_H_ +#define _SKEIN_IMPL_H_ + +#include <sys/skein.h> +#include "skein_impl.h" +#include "skein_port.h" + +/* determine where we can get bcopy/bzero declarations */ +#ifdef _KERNEL +#include <sys/systm.h> +#else +#include <strings.h> +#endif + +/* + * "Internal" Skein definitions + * -- not needed for sequential hashing API, but will be + * helpful for other uses of Skein (e.g., tree hash mode). + * -- included here so that they can be shared between + * reference and optimized code. + */ + +/* tweak word T[1]: bit field starting positions */ +/* offset 64 because it's the second word */ +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) + +/* bits 112..118: level in hash tree */ +#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) +/* bit 119: partial final input byte */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) +/* bits 120..125: type field */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) +/* bits 126: first block flag */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) +/* bit 127: final block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) + +/* tweak word T[1]: flag bit definition(s) */ +#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD) + +/* tweak word T[1]: tree level bit field mask */ +#define SKEIN_T1_TREE_LVL_MASK (((uint64_t)0x7F) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LEVEL(n) (((uint64_t)(n)) << SKEIN_T1_POS_TREE_LVL) + +/* tweak word T[1]: block type field */ +#define SKEIN_BLK_TYPE_KEY (0) /* key, for MAC and KDF */ +#define SKEIN_BLK_TYPE_CFG (4) /* configuration block */ +#define SKEIN_BLK_TYPE_PERS (8) /* personalization string */ +#define SKEIN_BLK_TYPE_PK 
(12) /* public key (for signature hashing) */ +#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ +#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ + +#define SKEIN_T1_BLK_TYPE(T) \ + (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +/* key, for MAC and KDF */ +#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) +/* configuration block */ +#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) +/* personalization string */ +#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) +/* public key (for digital signature hashing) */ +#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) +/* key identifier for KDF */ +#define SKEIN_T1_BLK_TYPE_KDF SKEIN_T1_BLK_TYPE(KDF) +/* nonce for PRNG */ +#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE) +/* message processing */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) +/* output stage */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) +/* field bit mask */ +#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) + +#define SKEIN_T1_BLK_TYPE_CFG_FINAL \ + (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL \ + (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_VERSION (1) + +#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ +#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian) */ +#endif + +#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32)) +#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22) + +#define SKEIN_CFG_STR_LEN (4*8) + +/* bit field definitions in config block treeInfo word */ +#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0) +#define SKEIN_CFG_TREE_NODE_SIZE_POS (8) +#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) + +#define SKEIN_CFG_TREE_LEAF_SIZE_MSK \ + 
(((uint64_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) +#define SKEIN_CFG_TREE_NODE_SIZE_MSK \ + (((uint64_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) +#define SKEIN_CFG_TREE_MAX_LEVEL_MSK \ + (((uint64_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) + +#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl) \ + ((((uint64_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ + (((uint64_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ + (((uint64_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS)) + +/* use as treeInfo in InitExt() call for sequential processing */ +#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0) + +/* + * Skein macros for getting/setting tweak words, etc. + * These are useful for partial input bytes, hash tree init/update, etc. + */ +#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) +#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \ + do { \ + (ctxPtr)->h.T[TWK_NUM] = (tVal); \ + _NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0) +#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1) +#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0) +#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1) + +/* set both tweak words at once */ +#define Skein_Set_T0_T1(ctxPtr, T0, T1) \ + do { \ + Skein_Set_T0(ctxPtr, (T0)); \ + Skein_Set_T1(ctxPtr, (T1)); \ + _NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Set_Type(ctxPtr, BLK_TYPE) \ + Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE) + +/* + * set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; + */ +#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \ + do { \ + Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | \ + SKEIN_T1_BLK_TYPE_ ## BLK_TYPE); \ + (ctxPtr)->h.bCnt = 0; \ + _NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Clear_First_Flag(hdr) \ + do { \ + (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \ + _NOTE(CONSTCOND) \ + } while (0) +#define Skein_Set_Bit_Pad_Flag(hdr) \ + do { \ + (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \ + 
_NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Set_Tree_Level(hdr, height) \ + do { \ + (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \ + _NOTE(CONSTCOND) \ + } while (0) + +/* + * "Internal" Skein definitions for debugging and error checking + * Note: in Illumos we always disable debugging features. + */ +#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr) +#define Skein_Show_Round(bits, ctx, r, X) +#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr) +#define Skein_Show_Final(bits, ctx, cnt, outPtr) +#define Skein_Show_Key(bits, ctx, key, keyBytes) + +/* run-time checks (e.g., bad params, uninitialized context)? */ +#ifndef SKEIN_ERR_CHECK +/* default: ignore all Asserts, for performance */ +#define Skein_Assert(x, retCode) +#define Skein_assert(x) +#elif defined(SKEIN_ASSERT) +#include <sys/debug.h> +#define Skein_Assert(x, retCode) ASSERT(x) +#define Skein_assert(x) ASSERT(x) +#else +#include <sys/debug.h> +/* caller error */ +#define Skein_Assert(x, retCode) \ + do { \ + if (!(x)) \ + return (retCode); \ + _NOTE(CONSTCOND) \ + } while (0) +/* internal error */ +#define Skein_assert(x) ASSERT(x) +#endif + +/* + * Skein block function constants (shared across Ref and Opt code) + */ +enum { + /* Skein_256 round rotation constants */ + R_256_0_0 = 14, R_256_0_1 = 16, + R_256_1_0 = 52, R_256_1_1 = 57, + R_256_2_0 = 23, R_256_2_1 = 40, + R_256_3_0 = 5, R_256_3_1 = 37, + R_256_4_0 = 25, R_256_4_1 = 33, + R_256_5_0 = 46, R_256_5_1 = 12, + R_256_6_0 = 58, R_256_6_1 = 22, + R_256_7_0 = 32, R_256_7_1 = 32, + + /* Skein_512 round rotation constants */ + R_512_0_0 = 46, R_512_0_1 = 36, R_512_0_2 = 19, R_512_0_3 = 37, + R_512_1_0 = 33, R_512_1_1 = 27, R_512_1_2 = 14, R_512_1_3 = 42, + R_512_2_0 = 17, R_512_2_1 = 49, R_512_2_2 = 36, R_512_2_3 = 39, + R_512_3_0 = 44, R_512_3_1 = 9, R_512_3_2 = 54, R_512_3_3 = 56, + R_512_4_0 = 39, R_512_4_1 = 30, R_512_4_2 = 34, R_512_4_3 = 24, + R_512_5_0 = 13, R_512_5_1 = 50, R_512_5_2 = 10, R_512_5_3 = 17, + R_512_6_0 = 25, 
R_512_6_1 = 29, R_512_6_2 = 39, R_512_6_3 = 43, + R_512_7_0 = 8, R_512_7_1 = 35, R_512_7_2 = 56, R_512_7_3 = 22, + + /* Skein1024 round rotation constants */ + R1024_0_0 = 24, R1024_0_1 = 13, R1024_0_2 = 8, R1024_0_3 = + 47, R1024_0_4 = 8, R1024_0_5 = 17, R1024_0_6 = 22, R1024_0_7 = 37, + R1024_1_0 = 38, R1024_1_1 = 19, R1024_1_2 = 10, R1024_1_3 = + 55, R1024_1_4 = 49, R1024_1_5 = 18, R1024_1_6 = 23, R1024_1_7 = 52, + R1024_2_0 = 33, R1024_2_1 = 4, R1024_2_2 = 51, R1024_2_3 = + 13, R1024_2_4 = 34, R1024_2_5 = 41, R1024_2_6 = 59, R1024_2_7 = 17, + R1024_3_0 = 5, R1024_3_1 = 20, R1024_3_2 = 48, R1024_3_3 = + 41, R1024_3_4 = 47, R1024_3_5 = 28, R1024_3_6 = 16, R1024_3_7 = 25, + R1024_4_0 = 41, R1024_4_1 = 9, R1024_4_2 = 37, R1024_4_3 = + 31, R1024_4_4 = 12, R1024_4_5 = 47, R1024_4_6 = 44, R1024_4_7 = 30, + R1024_5_0 = 16, R1024_5_1 = 34, R1024_5_2 = 56, R1024_5_3 = + 51, R1024_5_4 = 4, R1024_5_5 = 53, R1024_5_6 = 42, R1024_5_7 = 41, + R1024_6_0 = 31, R1024_6_1 = 44, R1024_6_2 = 47, R1024_6_3 = + 46, R1024_6_4 = 19, R1024_6_5 = 42, R1024_6_6 = 44, R1024_6_7 = 25, + R1024_7_0 = 9, R1024_7_1 = 48, R1024_7_2 = 35, R1024_7_3 = + 52, R1024_7_4 = 23, R1024_7_5 = 31, R1024_7_6 = 37, R1024_7_7 = 20 +}; + +/* number of rounds for the different block sizes */ +#define SKEIN_256_ROUNDS_TOTAL (72) +#define SKEIN_512_ROUNDS_TOTAL (72) +#define SKEIN1024_ROUNDS_TOTAL (80) + + +extern const uint64_t SKEIN_256_IV_128[]; +extern const uint64_t SKEIN_256_IV_160[]; +extern const uint64_t SKEIN_256_IV_224[]; +extern const uint64_t SKEIN_256_IV_256[]; +extern const uint64_t SKEIN_512_IV_128[]; +extern const uint64_t SKEIN_512_IV_160[]; +extern const uint64_t SKEIN_512_IV_224[]; +extern const uint64_t SKEIN_512_IV_256[]; +extern const uint64_t SKEIN_512_IV_384[]; +extern const uint64_t SKEIN_512_IV_512[]; +extern const uint64_t SKEIN1024_IV_384[]; +extern const uint64_t SKEIN1024_IV_512[]; +extern const uint64_t SKEIN1024_IV_1024[]; + +#endif /* _SKEIN_IMPL_H_ */ diff --git 
a/module/icp/algs/skein/skein_iv.c b/module/icp/algs/skein/skein_iv.c new file mode 100644 index 000000000..140d38f76 --- /dev/null +++ b/module/icp/algs/skein/skein_iv.c @@ -0,0 +1,185 @@ +/* + * Pre-computed Skein IVs + * + * NOTE: these values are not "magic" constants, but + * are generated using the Threefish block function. + * They are pre-computed here only for speed; i.e., to + * avoid the need for a Threefish call during Init(). + * + * The IV for any fixed hash length may be pre-computed. + * Only the most common values are included here. + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ +/* + * Illumos implementation note: these constants are for Skein v1.3 as per: + * http://www.skein-hash.info/sites/default/files/skein1.3.pdf + */ + +#include <sys/skein.h> /* get Skein macros and types */ +#include "skein_impl.h" /* get internal definitions */ + +#define MK_64 SKEIN_MK_64 + +/* blkSize = 256 bits. hashSize = 128 bits */ +const uint64_t SKEIN_256_IV_128[] = { + MK_64(0xE1111906, 0x964D7260), + MK_64(0x883DAAA7, 0x7C8D811C), + MK_64(0x10080DF4, 0x91960F7A), + MK_64(0xCCF7DDE5, 0xB45BC1C2) +}; + +/* blkSize = 256 bits. hashSize = 160 bits */ +const uint64_t SKEIN_256_IV_160[] = { + MK_64(0x14202314, 0x72825E98), + MK_64(0x2AC4E9A2, 0x5A77E590), + MK_64(0xD47A5856, 0x8838D63E), + MK_64(0x2DD2E496, 0x8586AB7D) +}; + +/* blkSize = 256 bits. hashSize = 224 bits */ +const uint64_t SKEIN_256_IV_224[] = { + MK_64(0xC6098A8C, 0x9AE5EA0B), + MK_64(0x876D5686, 0x08C5191C), + MK_64(0x99CB88D7, 0xD7F53884), + MK_64(0x384BDDB1, 0xAEDDB5DE) +}; + +/* blkSize = 256 bits. hashSize = 256 bits */ +const uint64_t SKEIN_256_IV_256[] = { + MK_64(0xFC9DA860, 0xD048B449), + MK_64(0x2FCA6647, 0x9FA7D833), + MK_64(0xB33BC389, 0x6656840F), + MK_64(0x6A54E920, 0xFDE8DA69) +}; + +/* blkSize = 512 bits. 
hashSize = 128 bits */ +const uint64_t SKEIN_512_IV_128[] = { + MK_64(0xA8BC7BF3, 0x6FBF9F52), + MK_64(0x1E9872CE, 0xBD1AF0AA), + MK_64(0x309B1790, 0xB32190D3), + MK_64(0xBCFBB854, 0x3F94805C), + MK_64(0x0DA61BCD, 0x6E31B11B), + MK_64(0x1A18EBEA, 0xD46A32E3), + MK_64(0xA2CC5B18, 0xCE84AA82), + MK_64(0x6982AB28, 0x9D46982D) +}; + +/* blkSize = 512 bits. hashSize = 160 bits */ +const uint64_t SKEIN_512_IV_160[] = { + MK_64(0x28B81A2A, 0xE013BD91), + MK_64(0xC2F11668, 0xB5BDF78F), + MK_64(0x1760D8F3, 0xF6A56F12), + MK_64(0x4FB74758, 0x8239904F), + MK_64(0x21EDE07F, 0x7EAF5056), + MK_64(0xD908922E, 0x63ED70B8), + MK_64(0xB8EC76FF, 0xECCB52FA), + MK_64(0x01A47BB8, 0xA3F27A6E) +}; + +/* blkSize = 512 bits. hashSize = 224 bits */ +const uint64_t SKEIN_512_IV_224[] = { + MK_64(0xCCD06162, 0x48677224), + MK_64(0xCBA65CF3, 0xA92339EF), + MK_64(0x8CCD69D6, 0x52FF4B64), + MK_64(0x398AED7B, 0x3AB890B4), + MK_64(0x0F59D1B1, 0x457D2BD0), + MK_64(0x6776FE65, 0x75D4EB3D), + MK_64(0x99FBC70E, 0x997413E9), + MK_64(0x9E2CFCCF, 0xE1C41EF7) +}; + +/* blkSize = 512 bits. hashSize = 256 bits */ +const uint64_t SKEIN_512_IV_256[] = { + MK_64(0xCCD044A1, 0x2FDB3E13), + MK_64(0xE8359030, 0x1A79A9EB), + MK_64(0x55AEA061, 0x4F816E6F), + MK_64(0x2A2767A4, 0xAE9B94DB), + MK_64(0xEC06025E, 0x74DD7683), + MK_64(0xE7A436CD, 0xC4746251), + MK_64(0xC36FBAF9, 0x393AD185), + MK_64(0x3EEDBA18, 0x33EDFC13) +}; + +/* blkSize = 512 bits. hashSize = 384 bits */ +const uint64_t SKEIN_512_IV_384[] = { + MK_64(0xA3F6C6BF, 0x3A75EF5F), + MK_64(0xB0FEF9CC, 0xFD84FAA4), + MK_64(0x9D77DD66, 0x3D770CFE), + MK_64(0xD798CBF3, 0xB468FDDA), + MK_64(0x1BC4A666, 0x8A0E4465), + MK_64(0x7ED7D434, 0xE5807407), + MK_64(0x548FC1AC, 0xD4EC44D6), + MK_64(0x266E1754, 0x6AA18FF8) +}; + +/* blkSize = 512 bits. 
hashSize = 512 bits */ +const uint64_t SKEIN_512_IV_512[] = { + MK_64(0x4903ADFF, 0x749C51CE), + MK_64(0x0D95DE39, 0x9746DF03), + MK_64(0x8FD19341, 0x27C79BCE), + MK_64(0x9A255629, 0xFF352CB1), + MK_64(0x5DB62599, 0xDF6CA7B0), + MK_64(0xEABE394C, 0xA9D5C3F4), + MK_64(0x991112C7, 0x1A75B523), + MK_64(0xAE18A40B, 0x660FCC33) +}; + +/* blkSize = 1024 bits. hashSize = 384 bits */ +const uint64_t SKEIN1024_IV_384[] = { + MK_64(0x5102B6B8, 0xC1894A35), + MK_64(0xFEEBC9E3, 0xFE8AF11A), + MK_64(0x0C807F06, 0xE32BED71), + MK_64(0x60C13A52, 0xB41A91F6), + MK_64(0x9716D35D, 0xD4917C38), + MK_64(0xE780DF12, 0x6FD31D3A), + MK_64(0x797846B6, 0xC898303A), + MK_64(0xB172C2A8, 0xB3572A3B), + MK_64(0xC9BC8203, 0xA6104A6C), + MK_64(0x65909338, 0xD75624F4), + MK_64(0x94BCC568, 0x4B3F81A0), + MK_64(0x3EBBF51E, 0x10ECFD46), + MK_64(0x2DF50F0B, 0xEEB08542), + MK_64(0x3B5A6530, 0x0DBC6516), + MK_64(0x484B9CD2, 0x167BBCE1), + MK_64(0x2D136947, 0xD4CBAFEA) +}; + +/* blkSize = 1024 bits. hashSize = 512 bits */ +const uint64_t SKEIN1024_IV_512[] = { + MK_64(0xCAEC0E5D, 0x7C1B1B18), + MK_64(0xA01B0E04, 0x5F03E802), + MK_64(0x33840451, 0xED912885), + MK_64(0x374AFB04, 0xEAEC2E1C), + MK_64(0xDF25A0E2, 0x813581F7), + MK_64(0xE4004093, 0x8B12F9D2), + MK_64(0xA662D539, 0xC2ED39B6), + MK_64(0xFA8B85CF, 0x45D8C75A), + MK_64(0x8316ED8E, 0x29EDE796), + MK_64(0x053289C0, 0x2E9F91B8), + MK_64(0xC3F8EF1D, 0x6D518B73), + MK_64(0xBDCEC3C4, 0xD5EF332E), + MK_64(0x549A7E52, 0x22974487), + MK_64(0x67070872, 0x5B749816), + MK_64(0xB9CD28FB, 0xF0581BD1), + MK_64(0x0E2940B8, 0x15804974) +}; + +/* blkSize = 1024 bits. 
hashSize = 1024 bits */ +const uint64_t SKEIN1024_IV_1024[] = { + MK_64(0xD593DA07, 0x41E72355), + MK_64(0x15B5E511, 0xAC73E00C), + MK_64(0x5180E5AE, 0xBAF2C4F0), + MK_64(0x03BD41D3, 0xFCBCAFAF), + MK_64(0x1CAEC6FD, 0x1983A898), + MK_64(0x6E510B8B, 0xCDD0589F), + MK_64(0x77E2BDFD, 0xC6394ADA), + MK_64(0xC11E1DB5, 0x24DCB0A3), + MK_64(0xD6D14AF9, 0xC6329AB5), + MK_64(0x6A9B0BFC, 0x6EB67E0D), + MK_64(0x9243C60D, 0xCCFF1332), + MK_64(0x1A1F1DDE, 0x743F02D4), + MK_64(0x0996753C, 0x10ED0BB8), + MK_64(0x6572DD22, 0xF2B4969A), + MK_64(0x61FD3062, 0xD00A579A), + MK_64(0x1DE0536E, 0x8682E539) +}; diff --git a/module/icp/algs/skein/skein_port.h b/module/icp/algs/skein/skein_port.h new file mode 100644 index 000000000..1b0225236 --- /dev/null +++ b/module/icp/algs/skein/skein_port.h @@ -0,0 +1,128 @@ +/* + * Platform-specific definitions for Skein hash function. + * + * Source code author: Doug Whiting, 2008. + * + * This algorithm and source code is released to the public domain. + * + * Many thanks to Brian Gladman for his portable header files. + * + * To port Skein to an "unsupported" platform, change the definitions + * in this file appropriately. + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ + +#ifndef _SKEIN_PORT_H_ +#define _SKEIN_PORT_H_ + +#include <sys/types.h> /* get integer type definitions */ +#include <sys/systm.h> /* for bcopy() */ + +#ifndef RotL_64 +#define RotL_64(x, N) (((x) << (N)) | ((x) >> (64 - (N)))) +#endif + +/* + * Skein is "natively" little-endian (unlike SHA-xxx), for optimal + * performance on x86 CPUs. The Skein code requires the following + * definitions for dealing with endianness: + * + * SKEIN_NEED_SWAP: 0 for little-endian, 1 for big-endian + * Skein_Put64_LSB_First + * Skein_Get64_LSB_First + * Skein_Swap64 + * + * If SKEIN_NEED_SWAP is defined at compile time, it is used here + * along with the portable versions of Put64/Get64/Swap64, which + * are slow in general. 
+ * + * Otherwise, an "auto-detect" of endianness is attempted below. + * If the default handling doesn't work well, the user may insert + * platform-specific code instead (e.g., for big-endian CPUs). + * + */ +#ifndef SKEIN_NEED_SWAP /* compile-time "override" for endianness? */ + +#include <sys/isa_defs.h> /* get endianness selection */ + +#define PLATFORM_MUST_ALIGN _ALIGNMENT_REQUIRED +#if defined(_BIG_ENDIAN) +/* here for big-endian CPUs */ +#define SKEIN_NEED_SWAP (1) +#else +/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */ +#define SKEIN_NEED_SWAP (0) +#if PLATFORM_MUST_ALIGN == 0 /* ok to use "fast" versions? */ +#define Skein_Put64_LSB_First(dst08, src64, bCnt) bcopy(src64, dst08, bCnt) +#define Skein_Get64_LSB_First(dst64, src08, wCnt) \ + bcopy(src08, dst64, 8 * (wCnt)) +#endif +#endif + +#endif /* ifndef SKEIN_NEED_SWAP */ + +/* + * Provide any definitions still needed. + */ +#ifndef Skein_Swap64 /* swap for big-endian, nop for little-endian */ +#if SKEIN_NEED_SWAP +#define Skein_Swap64(w64) \ + (((((uint64_t)(w64)) & 0xFF) << 56) | \ + (((((uint64_t)(w64)) >> 8) & 0xFF) << 48) | \ + (((((uint64_t)(w64)) >> 16) & 0xFF) << 40) | \ + (((((uint64_t)(w64)) >> 24) & 0xFF) << 32) | \ + (((((uint64_t)(w64)) >> 32) & 0xFF) << 24) | \ + (((((uint64_t)(w64)) >> 40) & 0xFF) << 16) | \ + (((((uint64_t)(w64)) >> 48) & 0xFF) << 8) | \ + (((((uint64_t)(w64)) >> 56) & 0xFF))) +#else +#define Skein_Swap64(w64) (w64) +#endif +#endif /* ifndef Skein_Swap64 */ + +#ifndef Skein_Put64_LSB_First +void +Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? 
*/ +{ + /* + * this version is fully portable (big-endian or little-endian), + * but slow + */ + size_t n; + + for (n = 0; n < bCnt; n++) + dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7))); +} +#else +; /* output only the function prototype */ +#endif +#endif /* ifndef Skein_Put64_LSB_First */ + +#ifndef Skein_Get64_LSB_First +void +Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt) +#ifdef SKEIN_PORT_CODE /* instantiate the function code here? */ +{ + /* + * this version is fully portable (big-endian or little-endian), + * but slow + */ + size_t n; + + for (n = 0; n < 8 * wCnt; n += 8) + dst[n / 8] = (((uint64_t)src[n])) + + (((uint64_t)src[n + 1]) << 8) + + (((uint64_t)src[n + 2]) << 16) + + (((uint64_t)src[n + 3]) << 24) + + (((uint64_t)src[n + 4]) << 32) + + (((uint64_t)src[n + 5]) << 40) + + (((uint64_t)src[n + 6]) << 48) + + (((uint64_t)src[n + 7]) << 56); +} +#else +; /* output only the function prototype */ +#endif +#endif /* ifndef Skein_Get64_LSB_First */ + +#endif /* _SKEIN_PORT_H_ */ diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S index b689c9022..d55c5eb48 100644 --- a/module/icp/asm-x86_64/sha2/sha256_impl.S +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -62,11 +62,9 @@ */ /* - * This file was generated by a perl script (sha512-x86_64.pl) that could - * be used to generate sha256 and sha512 variants from the same code base. - * For our purposes, we only need sha256 and so getting the perl script to - * run as part of the build process seemed superfluous. The comments from - * the original file have been pasted above. + * This file was generated by a perl script (sha512-x86_64.pl) that were + * used to generate sha256 and sha512 variants from the same code base. + * The comments from the original file have been pasted above. 
*/ #if defined(lint) || defined(__lint) diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S new file mode 100644 index 000000000..24a41745b --- /dev/null +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -0,0 +1,2083 @@ +/* + * ==================================================================== + * Written by Andy Polyakov <[email protected]> for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. In + * former case the instructions operate on 32-bit operands, while in + * latter - on 64-bit ones. All I had to do is to get one flavor right, + * the other one passed the test right away:-) + * + * sha256_block runs in ~1005 cycles on Opteron, which gives you + * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock + * frequency in GHz. sha512_block runs in ~1275 cycles, which results + * in 128*1000/1275=100MBps per GHz. Is there room for improvement? + * Well, if you compare it to IA-64 implementation, which maintains + * X[16] in register bank[!], tends to 4 instructions per CPU clock + * cycle and runs in 1003 cycles, 1275 is very good result for 3-way + * issue Opteron pipeline and X[16] maintained in memory. 
So that *if* + * there is a way to improve it, *then* the only way would be to try to + * offload X[16] updates to SSE unit, but that would require "deeper" + * loop unroll, which in turn would naturally cause size blow-up, not + * to mention increased complexity! And once again, only *if* it's + * actually possible to noticeably improve overall ILP, instruction + * level parallelism, on a given CPU implementation in this case. + * + * Special note on Intel EM64T. While Opteron CPU exhibits perfect + * performance ratio of 1.5 between 64- and 32-bit flavors [see above], + * [currently available] EM64T CPUs apparently are far from it. On the + * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit + * sha256_block:-( This is presumably because 64-bit shifts/rotates + * apparently are not atomic instructions, but implemented in microcode. + */ + +/* + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha512-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). Replaced the .picmeup macro with assembler code. + * + * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", + * at the beginning of SHA2_CTX (the next field is 8-byte aligned). + */ + +/* + * This file was generated by a perl script (sha512-x86_64.pl) that was + * used to generate sha256 and sha512 variants from the same code base. + * The comments from the original file have been pasted above.
+ */ + + +#if defined(lint) || defined(__lint) +#include <sys/stdint.h> +#include <sha2/sha2.h> + +/* ARGSUSED */ +void +SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) +{ +} + + +#else +#define _ASM +#include <sys/asm_linkage.h> + +ENTRY_NP(SHA512TransformBlocks) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%rbp # copy %rsp + shl $4,%rdx # num*16 + sub $16*8+4*8,%rsp + lea (%rsi,%rdx,8),%rdx # inp+num*16*8 + and $-64,%rsp # align stack frame + add $8,%rdi # Skip OpenSolaris field, "algotype" + mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg + mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg + mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg + mov %rbp,16*8+3*8(%rsp) # save copy of %rsp + + /.picmeup %rbp + / The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts + / the address of the "next" instruction into the target register + / (%rbp). This generates these 2 instructions: + lea .Llea(%rip),%rbp + /nop / .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K512-.(%rbp),%rbp + + mov 8*0(%rdi),%rax + mov 8*1(%rdi),%rbx + mov 8*2(%rdi),%rcx + mov 8*3(%rdi),%rdx + mov 8*4(%rdi),%r8 + mov 8*5(%rdi),%r9 + mov 8*6(%rdi),%r10 + mov 8*7(%rdi),%r11 + jmp .Lloop + +.align 16 +.Lloop: + xor %rdi,%rdi + mov 8*0(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or 
%r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*1(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*2(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*3(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 
# T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 8*4(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*5(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or 
%r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*6(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*7(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 8*8(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add 
%r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*9(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*10(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # 
h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*11(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 8*12(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*13(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + 
add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*14(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*15(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add 
%r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + mov 8(%rsp),%r13 + mov 112(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 72(%rsp),%r12 + + add 0(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 16(%rsp),%r13 + mov 120(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 80(%rsp),%r12 + + add 8(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov 
%r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 24(%rsp),%r13 + mov 0(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 88(%rsp),%r12 + + add 16(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 32(%rsp),%r13 + mov 8(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 96(%rsp),%r12 + + add 24(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and 
%rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 40(%rsp),%r13 + mov 16(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 104(%rsp),%r12 + + add 32(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 48(%rsp),%r13 + mov 24(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # 
sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 112(%rsp),%r12 + + add 40(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 56(%rsp),%r13 + mov 32(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 120(%rsp),%r12 + + add 48(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 64(%rsp),%r13 + mov 
40(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 0(%rsp),%r12 + + add 56(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 72(%rsp),%r13 + mov 48(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 8(%rsp),%r12 + + add 64(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and 
%rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 80(%rsp),%r13 + mov 56(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 16(%rsp),%r12 + + add 72(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 88(%rsp),%r13 + mov 64(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 24(%rsp),%r12 + + add 80(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # 
T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 96(%rsp),%r13 + mov 72(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 32(%rsp),%r12 + + add 88(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 104(%rsp),%r13 + mov 80(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 40(%rsp),%r12 + + add 96(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror 
$23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 112(%rsp),%r13 + mov 88(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 48(%rsp),%r12 + + add 104(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 120(%rsp),%r13 + mov 96(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + 
+ xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 56(%rsp),%r12 + + add 112(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 0(%rsp),%r13 + mov 104(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 64(%rsp),%r12 + + add 120(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + cmp 
$80,%rdi + jb .Lrounds_16_xx + + mov 16*8+0*8(%rsp),%rdi + lea 16*8(%rsi),%rsi + + add 8*0(%rdi),%rax + add 8*1(%rdi),%rbx + add 8*2(%rdi),%rcx + add 8*3(%rdi),%rdx + add 8*4(%rdi),%r8 + add 8*5(%rdi),%r9 + add 8*6(%rdi),%r10 + add 8*7(%rdi),%r11 + + cmp 16*8+2*8(%rsp),%rsi + + mov %rax,8*0(%rdi) + mov %rbx,8*1(%rdi) + mov %rcx,8*2(%rdi) + mov %rdx,8*3(%rdi) + mov %r8,8*4(%rdi) + mov %r9,8*5(%rdi) + mov %r10,8*6(%rdi) + mov %r11,8*7(%rdi) + jb .Lloop + + mov 16*8+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA512TransformBlocks) + +.align 64 +.type K512,@object +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 
0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +#endif /* !lint && !__lint */ diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c index 7dd5dbf42..aa63e431f 100644 --- a/module/icp/illumos-crypto.c +++ b/module/icp/illumos-crypto.c @@ -109,8 +109,10 @@ void __exit icp_fini(void) { + skein_mod_fini(); sha2_mod_fini(); sha1_mod_fini(); + edonr_mod_fini(); aes_mod_fini(); kcf_sched_destroy(); kcf_prov_tab_destroy(); @@ -139,8 +141,10 @@ icp_init(void) /* initialize algorithms */ aes_mod_init(); + edonr_mod_init(); sha1_mod_init(); sha2_mod_init(); + skein_mod_init(); return (0); } diff --git a/module/icp/include/sha2/sha2.h b/module/icp/include/sha2/sha2.h deleted file mode 100644 index 8e53987a7..000000000 --- a/module/icp/include/sha2/sha2.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
- * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* Copyright 2013 Saso Kiselkov. All rights reserved. */ - -#ifndef _SYS_SHA2_H -#define _SYS_SHA2_H - -#include <sys/types.h> /* for uint_* */ - -#ifdef __cplusplus -extern "C" { -#endif - -#define SHA2_HMAC_MIN_KEY_LEN 1 /* SHA2-HMAC min key length in bytes */ -#define SHA2_HMAC_MAX_KEY_LEN INT_MAX /* SHA2-HMAC max key length in bytes */ - -#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ - -#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ - -#define SHA256 0 -#define SHA256_HMAC 1 -#define SHA256_HMAC_GEN 2 - -/* - * SHA2 context. - * The contents of this structure are a private interface between the - * Init/Update/Final calls of the functions defined below. - * Callers must never attempt to read or write any of the fields - * in this structure directly. 
- */ -typedef struct { - uint32_t algotype; /* Algorithm Type */ - - /* state (ABCDEFGH) */ - union { - uint32_t s32[8]; /* for SHA256 */ - uint64_t s64[8]; /* for SHA384/512 */ - } state; - /* number of bits */ - union { - uint32_t c32[2]; /* for SHA256 , modulo 2^64 */ - uint64_t c64[2]; /* for SHA384/512, modulo 2^128 */ - } count; - union { - uint8_t buf8[128]; /* undigested input */ - uint32_t buf32[32]; /* realigned input */ - uint64_t buf64[16]; /* realigned input */ - } buf_un; -} SHA2_CTX; - -typedef SHA2_CTX SHA256_CTX; -typedef SHA2_CTX SHA384_CTX; -typedef SHA2_CTX SHA512_CTX; - -extern void SHA2Init(uint64_t mech, SHA2_CTX *); - -extern void SHA2Update(SHA2_CTX *, const void *, size_t); - -extern void SHA2Final(void *, SHA2_CTX *); - -extern void SHA256Init(SHA256_CTX *); - -extern void SHA256Update(SHA256_CTX *, const void *, size_t); - -extern void SHA256Final(void *, SHA256_CTX *); - -#ifdef _SHA2_IMPL -/* - * The following types/functions are all private to the implementation - * of the SHA2 functions and must not be used by consumers of the interface - */ - -/* - * List of support mechanisms in this module. 
- * - * It is important to note that in the module, division or modulus calculations - * are used on the enumerated type to determine which mechanism is being used; - * therefore, changing the order or additional mechanisms should be done - * carefully - */ -typedef enum sha2_mech_type { - SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ - SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ - SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ -} sha2_mech_type_t; - -#endif /* _SHA2_IMPL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_SHA2_H */ diff --git a/module/icp/include/sha2/sha2_impl.h b/module/icp/include/sha2/sha2_impl.h index bb42c3cd4..b9768d344 100644 --- a/module/icp/include/sha2/sha2_impl.h +++ b/module/icp/include/sha2/sha2_impl.h @@ -26,6 +26,8 @@ #ifndef _SHA2_IMPL_H #define _SHA2_IMPL_H +#include <sys/sha2.h> + #ifdef __cplusplus extern "C" { #endif diff --git a/module/icp/io/edonr_mod.c b/module/icp/io/edonr_mod.c new file mode 100644 index 000000000..19b5c963d --- /dev/null +++ b/module/icp/io/edonr_mod.c @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. 
+ */ + +#include <sys/modctl.h> +#include <sys/crypto/common.h> +#include <sys/crypto/spi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#include <sys/edonr.h> + +/* + * Unlike sha2 or skein, we won't expose edonr via the Kernel Cryptographic + * Framework (KCF), because Edon-R is *NOT* suitable for general-purpose + * cryptographic use. Users of Edon-R must interface directly to this module. + */ + +static struct modlmisc modlmisc = { + &mod_cryptoops, + "Edon-R Message-Digest Algorithm" +}; + +static struct modlinkage modlinkage = { + MODREV_1, {&modlmisc, NULL} +}; + +int +edonr_mod_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) + return (error); + + return (0); +} + +int +edonr_mod_fini(void) { + return (mod_remove(&modlinkage)); +} diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c index be0f7a42c..3913d7618 100644 --- a/module/icp/io/sha2_mod.c +++ b/module/icp/io/sha2_mod.c @@ -30,7 +30,7 @@ #include <sys/crypto/spi.h> #include <sys/crypto/icp.h> #define _SHA2_IMPL -#include <sha2/sha2.h> +#include <sys/sha2.h> #include <sha2/sha2_impl.h> /* diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c new file mode 100644 index 000000000..e909a7e31 --- /dev/null +++ b/module/icp/io/skein_mod.c @@ -0,0 +1,721 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +#include <sys/modctl.h> +#include <sys/crypto/common.h> +#include <sys/crypto/spi.h> +#include <sys/sysmacros.h> +#include <sys/systm.h> +#define SKEIN_MODULE_IMPL +#include <sys/skein.h> + +/* + * Like the sha2 module, we create the skein module with two modlinkages: + * - modlmisc to allow direct calls to Skein_* API functions. + * - modlcrypto to integrate well into the Kernel Crypto Framework (KCF). + */ +static struct modlmisc modlmisc = { + &mod_cryptoops, + "Skein Message-Digest Algorithm" +}; + +static struct modlcrypto modlcrypto = { + &mod_cryptoops, + "Skein Kernel SW Provider" +}; + +static struct modlinkage modlinkage = { + MODREV_1, {&modlmisc, &modlcrypto, NULL} +}; + +static crypto_mech_info_t skein_mech_info_tab[] = { + {CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + {CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + {CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + {CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + {CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + {CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, + CRYPTO_KEYSIZE_UNIT_IN_BYTES} +}; + +static void skein_provider_status(crypto_provider_handle_t, uint_t *); + +static crypto_control_ops_t skein_control_ops = { + 
skein_provider_status +}; + +static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *, + crypto_req_handle_t); +static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); +static int skein_update(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); +static int skein_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); +static int skein_digest_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); + +static crypto_digest_ops_t skein_digest_ops = { + skein_digest_init, + skein_digest, + skein_update, + NULL, + skein_final, + skein_digest_atomic +}; + +static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); +static int skein_mac_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); + +static crypto_mac_ops_t skein_mac_ops = { + skein_mac_init, + NULL, + skein_update, /* using regular digest update is OK here */ + skein_final, /* using regular digest final is OK here */ + skein_mac_atomic, + NULL +}; + +static int skein_create_ctx_template(crypto_provider_handle_t, + crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *, + size_t *, crypto_req_handle_t); +static int skein_free_context(crypto_ctx_t *); + +static crypto_ctx_ops_t skein_ctx_ops = { + skein_create_ctx_template, + skein_free_context +}; + +static crypto_ops_t skein_crypto_ops = {{{{{ + &skein_control_ops, + &skein_digest_ops, + NULL, + &skein_mac_ops, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + &skein_ctx_ops, +}}}}}; + +static crypto_provider_info_t skein_prov_info = {{{{ + CRYPTO_SPI_VERSION_1, + "Skein Software Provider", + CRYPTO_SW_PROVIDER, + NULL, + &skein_crypto_ops, + sizeof (skein_mech_info_tab) / sizeof 
(crypto_mech_info_t), + skein_mech_info_tab +}}}}; + +static crypto_kcf_provider_handle_t skein_prov_handle = 0; + +typedef struct skein_ctx { + skein_mech_type_t sc_mech_type; + size_t sc_digest_bitlen; + /*LINTED(E_ANONYMOUS_UNION_DECL)*/ + union { + Skein_256_Ctxt_t sc_256; + Skein_512_Ctxt_t sc_512; + Skein1024_Ctxt_t sc_1024; + }; +} skein_ctx_t; +#define SKEIN_CTX(_ctx_) ((skein_ctx_t *)((_ctx_)->cc_provider_private)) +#define SKEIN_CTX_LVALUE(_ctx_) (_ctx_)->cc_provider_private +#define SKEIN_OP(_skein_ctx, _op, ...) \ + do { \ + skein_ctx_t *sc = (_skein_ctx); \ + switch (sc->sc_mech_type) { \ + case SKEIN_256_MECH_INFO_TYPE: \ + case SKEIN_256_MAC_MECH_INFO_TYPE: \ + (void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\ + break; \ + case SKEIN_512_MECH_INFO_TYPE: \ + case SKEIN_512_MAC_MECH_INFO_TYPE: \ + (void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\ + break; \ + case SKEIN1024_MECH_INFO_TYPE: \ + case SKEIN1024_MAC_MECH_INFO_TYPE: \ + (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\ + break; \ + } \ + _NOTE(CONSTCOND) \ + } while (0) + +static int +skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result) +{ + if (mechanism->cm_param != NULL) { + /*LINTED(E_BAD_PTR_CAST_ALIGN)*/ + skein_param_t *param = (skein_param_t *)mechanism->cm_param; + + if (mechanism->cm_param_len != sizeof (*param) || + param->sp_digest_bitlen == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + *result = param->sp_digest_bitlen; + } else { + switch (mechanism->cm_type) { + case SKEIN_256_MECH_INFO_TYPE: + *result = 256; + break; + case SKEIN_512_MECH_INFO_TYPE: + *result = 512; + break; + case SKEIN1024_MECH_INFO_TYPE: + *result = 1024; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + } + return (CRYPTO_SUCCESS); +} + +int +skein_mod_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) + return (error); + + /* + * Try to register with KCF - failure shouldn't unload us, since we + * still may want to continue 
providing misc/skein functionality. + */ + (void) crypto_register_provider(&skein_prov_info, &skein_prov_handle); + + return (0); +} + +int +skein_mod_fini(void) { + return (mod_remove(&modlinkage)); +} + +/* + * KCF software provider control entry points. + */ +/* ARGSUSED */ +static void +skein_provider_status(crypto_provider_handle_t provider, uint_t *status) +{ + *status = CRYPTO_PROVIDER_READY; +} + +/* + * General Skein hashing helper functions. + */ + +/* + * Performs an Update on a context with uio input data. + */ +static int +skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) +{ + off_t offset = data->cd_offset; + size_t length = data->cd_length; + uint_t vec_idx; + size_t cur_len; + const uio_t *uio = data->cd_uio; + + /* we support only kernel buffer */ + if (uio->uio_segflg != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing data to be + * digested. + */ + for (vec_idx = 0; vec_idx < uio->uio_iovcnt && + offset >= uio->uio_iov[vec_idx].iov_len; + offset -= uio->uio_iov[vec_idx++].iov_len) + ; + if (vec_idx == uio->uio_iovcnt) { + /* + * The caller specified an offset that is larger than the + * total size of the buffers it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + /* + * Now do the digesting on the iovecs. + */ + while (vec_idx < uio->uio_iovcnt && length > 0) { + cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset, length); + SKEIN_OP(ctx, Update, (uint8_t *)uio->uio_iov[vec_idx].iov_base + + offset, cur_len); + length -= cur_len; + vec_idx++; + offset = 0; + } + + if (vec_idx == uio->uio_iovcnt && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + return (CRYPTO_SUCCESS); +} + +/* + * Performs a Final on a context and writes to a uio digest output. 
+ */ +static int +skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, + crypto_req_handle_t req) +{ + off_t offset = digest->cd_offset; + uint_t vec_idx; + uio_t *uio = digest->cd_uio; + + /* we support only kernel buffer */ + if (uio->uio_segflg != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing ptr to the digest to be returned. + */ + for (vec_idx = 0; offset >= uio->uio_iov[vec_idx].iov_len && + vec_idx < uio->uio_iovcnt; + offset -= uio->uio_iov[vec_idx++].iov_len) + ; + if (vec_idx == uio->uio_iovcnt) { + /* + * The caller specified an offset that is larger than the + * total size of the buffers it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <= + uio->uio_iov[vec_idx].iov_len) { + /* The computed digest will fit in the current iovec. */ + SKEIN_OP(ctx, Final, + (uchar_t *)uio->uio_iov[vec_idx].iov_base + offset); + } else { + uint8_t *digest_tmp; + off_t scratch_offset = 0; + size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen); + size_t cur_len; + + digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES( + ctx->sc_digest_bitlen), crypto_kmflag(req)); + if (digest_tmp == NULL) + return (CRYPTO_HOST_MEMORY); + SKEIN_OP(ctx, Final, digest_tmp); + while (vec_idx < uio->uio_iovcnt && length > 0) { + cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset, + length); + bcopy(digest_tmp + scratch_offset, + uio->uio_iov[vec_idx].iov_base + offset, cur_len); + + length -= cur_len; + vec_idx++; + scratch_offset += cur_len; + offset = 0; + } + kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen)); + + if (vec_idx == uio->uio_iovcnt && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it + * provided. 
+ */ + return (CRYPTO_DATA_LEN_RANGE); + } + } + + return (CRYPTO_SUCCESS); +} + +/* + * KCF software provider digest entry points. + */ + +/* + * Initializes a skein digest context to the configuration in `mechanism'. + * The mechanism cm_type must be one of SKEIN_*_MECH_INFO_TYPE. The cm_param + * field may contain a skein_param_t structure indicating the length of the + * digest the algorithm should produce. Otherwise the default output lengths + * are applied (32 bytes for Skein-256, 64 bytes for Skein-512 and 128 bytes + * for Skein-1024). + */ +static int +skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type)) + return (CRYPTO_MECHANISM_INVALID); + + SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)), + crypto_kmflag(req)); + if (SKEIN_CTX(ctx) == NULL) + return (CRYPTO_HOST_MEMORY); + + SKEIN_CTX(ctx)->sc_mech_type = mechanism->cm_type; + error = skein_get_digest_bitlen(mechanism, + &SKEIN_CTX(ctx)->sc_digest_bitlen); + if (error != CRYPTO_SUCCESS) + goto errout; + SKEIN_OP(SKEIN_CTX(ctx), Init, SKEIN_CTX(ctx)->sc_digest_bitlen); + + return (CRYPTO_SUCCESS); +errout: + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + SKEIN_CTX_LVALUE(ctx) = NULL; + return (error); +} + +/* + * Executes a skein_update and skein_digest on a pre-initialized crypto + * context in a single step. See the documentation to these functions to + * see what to pass here. 
+ */ +static int +skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + ASSERT(SKEIN_CTX(ctx) != NULL); + + if (digest->cd_length < + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { + digest->cd_length = + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); + return (CRYPTO_BUFFER_TOO_SMALL); + } + + error = skein_update(ctx, data, req); + if (error != CRYPTO_SUCCESS) { + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + SKEIN_CTX_LVALUE(ctx) = NULL; + digest->cd_length = 0; + return (error); + } + error = skein_final(ctx, digest, req); + + return (error); +} + +/* + * Performs a skein Update with the input message in `data' (successive calls + * can push more data). This is used both for digest and MAC operation. + * Supported input data formats are raw, uio and mblk. + */ +/*ARGSUSED*/ +static int +skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + ASSERT(SKEIN_CTX(ctx) != NULL); + + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SKEIN_OP(SKEIN_CTX(ctx), Update, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + error = skein_digest_update_uio(SKEIN_CTX(ctx), data); + break; + default: + error = CRYPTO_ARGUMENTS_BAD; + } + + return (error); +} + +/* + * Performs a skein Final, writing the output to `digest'. This is used both + * for digest and MAC operation. + * Supported output digest formats are raw, uio and mblk. 
+ */ +/*ARGSUSED*/ +static int +skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + ASSERT(SKEIN_CTX(ctx) != NULL); + + if (digest->cd_length < + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { + digest->cd_length = + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); + return (CRYPTO_BUFFER_TOO_SMALL); + } + + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SKEIN_OP(SKEIN_CTX(ctx), Final, + (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset); + break; + case CRYPTO_DATA_UIO: + error = skein_digest_final_uio(SKEIN_CTX(ctx), digest, req); + break; + default: + error = CRYPTO_ARGUMENTS_BAD; + } + + if (error == CRYPTO_SUCCESS) + digest->cd_length = + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); + else + digest->cd_length = 0; + + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx)))); + SKEIN_CTX_LVALUE(ctx) = NULL; + + return (error); +} + +/* + * Performs a full skein digest computation in a single call, configuring the + * algorithm according to `mechanism', reading the input to be digested from + * `data' and writing the output to `digest'. + * Supported input/output formats are raw, uio and mblk. 
+ */ +/*ARGSUSED*/ +static int +skein_digest_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req) +{ + int error; + skein_ctx_t skein_ctx; + crypto_ctx_t ctx; + SKEIN_CTX_LVALUE(&ctx) = &skein_ctx; + + /* Init */ + if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type)) + return (CRYPTO_MECHANISM_INVALID); + skein_ctx.sc_mech_type = mechanism->cm_type; + error = skein_get_digest_bitlen(mechanism, &skein_ctx.sc_digest_bitlen); + if (error != CRYPTO_SUCCESS) + goto out; + SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen); + + if ((error = skein_update(&ctx, data, digest)) != CRYPTO_SUCCESS) + goto out; + if ((error = skein_final(&ctx, data, digest)) != CRYPTO_SUCCESS) + goto out; + +out: + if (error == CRYPTO_SUCCESS) + digest->cd_length = + CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen); + else + digest->cd_length = 0; + bzero(&skein_ctx, sizeof (skein_ctx)); + + return (error); +} + +/* + * Helper function that builds a Skein MAC context from the provided + * mechanism and key. + */ +static int +skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_key_t *key) +{ + int error; + + if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type)) + return (CRYPTO_MECHANISM_INVALID); + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + ctx->sc_mech_type = mechanism->cm_type; + error = skein_get_digest_bitlen(mechanism, &ctx->sc_digest_bitlen); + if (error != CRYPTO_SUCCESS) + return (error); + SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0, key->ck_data, + CRYPTO_BITS2BYTES(key->ck_length)); + + return (CRYPTO_SUCCESS); +} + +/* + * KCF software provide mac entry points. + */ +/* + * Initializes a skein MAC context. You may pass a ctx_template, in which + * case the template will be reused to make initialization more efficient. + * Otherwise a new context will be constructed. 
The mechanism cm_type must + * be one of SKEIN_*_MAC_MECH_INFO_TYPE. Same as in skein_digest_init, you + * may pass a skein_param_t in cm_param to configure the length of the + * digest. The key must be in raw format. + */ +static int +skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_spi_ctx_template_t ctx_template, + crypto_req_handle_t req) +{ + int error; + + SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)), + crypto_kmflag(req)); + if (SKEIN_CTX(ctx) == NULL) + return (CRYPTO_HOST_MEMORY); + + if (ctx_template != NULL) { + bcopy(ctx_template, SKEIN_CTX(ctx), + sizeof (*SKEIN_CTX(ctx))); + } else { + error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key); + if (error != CRYPTO_SUCCESS) + goto errout; + } + + return (CRYPTO_SUCCESS); +errout: + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + return (error); +} + +/* + * The MAC update and final calls are reused from the regular digest code. + */ + +/*ARGSUSED*/ +/* + * Same as skein_digest_atomic, performs an atomic Skein MAC operation in + * one step. All the same properties apply to the arguments of this + * function as to those of the partial operations above. 
+ */ +static int +skein_mac_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req) +{ + /* faux crypto context just for skein_digest_{update,final} */ + int error; + crypto_ctx_t ctx; + skein_ctx_t skein_ctx; + SKEIN_CTX_LVALUE(&ctx) = &skein_ctx; + + if (ctx_template != NULL) { + bcopy(ctx_template, &skein_ctx, sizeof (skein_ctx)); + } else { + error = skein_mac_ctx_build(&skein_ctx, mechanism, key); + if (error != CRYPTO_SUCCESS) + goto errout; + } + + if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS) + goto errout; + if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS) + goto errout; + + return (CRYPTO_SUCCESS); +errout: + bzero(&skein_ctx, sizeof (skein_ctx)); + return (error); +} + +/* + * KCF software provider context management entry points. + */ + +/* + * Constructs a context template for the Skein MAC algorithm. The same + * properties apply to the arguments of this function as to those of + * skein_mac_init. + */ +/*ARGSUSED*/ +static int +skein_create_ctx_template(crypto_provider_handle_t provider, + crypto_mechanism_t *mechanism, crypto_key_t *key, + crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size, + crypto_req_handle_t req) +{ + int error; + skein_ctx_t *ctx_tmpl; + + ctx_tmpl = kmem_alloc(sizeof (*ctx_tmpl), crypto_kmflag(req)); + if (ctx_tmpl == NULL) + return (CRYPTO_HOST_MEMORY); + error = skein_mac_ctx_build(ctx_tmpl, mechanism, key); + if (error != CRYPTO_SUCCESS) + goto errout; + *ctx_template = ctx_tmpl; + *ctx_template_size = sizeof (*ctx_tmpl); + + return (CRYPTO_SUCCESS); +errout: + bzero(ctx_tmpl, sizeof (*ctx_tmpl)); + kmem_free(ctx_tmpl, sizeof (*ctx_tmpl)); + return (error); +} + +/* + * Frees a skein context in a parent crypto context. 
+ */ +static int +skein_free_context(crypto_ctx_t *ctx) +{ + if (SKEIN_CTX(ctx) != NULL) { + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + SKEIN_CTX_LVALUE(ctx) = NULL; + } + + return (CRYPTO_SUCCESS); +} |