OpenZFS 4185 - add new cryptographic checksums to ZFS: SHA-512, Skein, Edon-R

Reviewed by: George Wilson <[email protected]> Reviewed by: Prakash Surya <[email protected]> Reviewed by: Saso Kiselkov <[email protected]> Reviewed by: Richard Lowe <[email protected]> Approved by: Garrett D'Amore <[email protected]> Ported by: Tony Hutter <[email protected]> OpenZFS-issue: https://www.illumos.org/issues/4185 OpenZFS-commit: https://github.com/openzfs/openzfs/commit/45818ee Porting Notes: This code is ported on top of the Illumos Crypto Framework code: https://github.com/zfsonlinux/zfs/pull/4329/commits/b5e030c8dbb9cd393d313571dee4756fbba8c22d The list of porting changes includes: - Copied module/icp/include/sha2/sha2.h directly from illumos - Removed from module/icp/algs/sha2/sha2.c: #pragma inline(SHA256Init, SHA384Init, SHA512Init) - Added 'ctx' to lib/libzfs/libzfs_sendrecv.c:zio_checksum_SHA256() since it now takes in an extra parameter. - Added CTASSERT() to assert.h for module/zfs/edonr_zfs.c - Added skein & edonr to libicp/Makefile.am - Added sha512.S. It was generated from sha512-x86_64.pl in Illumos. - Updated ztest.c with new fletcher_4_*() args; used NULL for new CTX argument. - In icp/algs/edonr/edonr_byteorder.h, Removed the #if defined(__linux) section to not #include the non-existent endian.h. - In skein_test.c, rename NULL to 0 in "no test vector" array entries to get around a compiler warning. - Fixup test files: - Rename <sys/varargs.h> -> <varargs.h>, <strings.h> -> <string.h>, - Remove <note.h> and define NOTE() as NOP. - Define u_longlong_t - Rename "#!/usr/bin/ksh" -> "#!/bin/ksh -p" - Rename NULL to 0 in "no test vector" array entries to get around a compiler warning. - Remove "for isa in $($ISAINFO); do" stuff - Add/update Makefiles - Add some userspace headers like stdio.h/stdlib.h in places of sys/types.h. - EXPORT_SYMBOL *_Init/*_Update/*_Final... routines in ICP modules. 
- Update scripts/zfs2zol-patch.sed - include <sys/sha2.h> in sha2_impl.h - Add sha2.h to include/sys/Makefile.am - Add skein and edonr dirs to icp Makefile - Add new checksums to zpool_get.cfg - Move checksum switch block from zfs_secpolicy_setprop() to zfs_check_settable() - Fix -Wuninitialized error in edonr_byteorder.h on PPC - Fix stack frame size errors on ARM32 - Don't unroll loops in Skein on 32-bit to save stack space - Add memory barriers in sha2.c on 32-bit to save stack space - Add filetest_001_pos.ksh checksum sanity test - Add option to write pseudorandom data in file_write utility
author: Tony Hutter <[email protected]> 2016-06-15 15:47:05 -0700
committer: Tony Hutter <[email protected]> 2016-10-03 14:51:15 -0700
commit: 3c67d83a8afb391f20bc53d36a0cebea6897b3e2 (patch)
tree: 2b862986c83414c7359c00219b43ad47dd73f81e /module/icp
parent: 62a65a654e15a1388bfb571727e69b46e7cc07ab (diff)
19 files changed, 6644 insertions, 128 deletions
diff --git a/module/icp/Makefile.in b/module/icp/Makefile.in
index 4be03dbae..b822635b7 100644
--- a/module/icp/Makefile.in
+++ b/module/icp/Makefile.in
@@ -12,6 +12,7 @@ ASM_SOURCES += asm-x86_64/aes/aes_intel.o
 ASM_SOURCES += asm-x86_64/modes/gcm_intel.o
 ASM_SOURCES += asm-x86_64/sha1/sha1-x86_64.o
 ASM_SOURCES += asm-x86_64/sha2/sha256_impl.o
+ASM_SOURCES += asm-x86_64/sha2/sha512_impl.o
 endif
 
 ifeq ($(TARGET_ASM_DIR), asm-i386)
@@ -43,8 +44,10 @@ $(MODULE)-objs += core/kcf_mech_tabs.o
 $(MODULE)-objs += core/kcf_prov_lib.o
 $(MODULE)-objs += spi/kcf_spi.o
 $(MODULE)-objs += io/aes.o
+$(MODULE)-objs += io/edonr_mod.o
 $(MODULE)-objs += io/sha1_mod.o
 $(MODULE)-objs += io/sha2_mod.o
+$(MODULE)-objs += io/skein_mod.o
 $(MODULE)-objs += os/modhash.o
 $(MODULE)-objs += os/modconf.o
 $(MODULE)-objs += algs/modes/cbc.o
@@ -55,8 +58,13 @@ $(MODULE)-objs += algs/modes/gcm.o
 $(MODULE)-objs += algs/modes/modes.o
 $(MODULE)-objs += algs/aes/aes_impl.o
 $(MODULE)-objs += algs/aes/aes_modes.o
+$(MODULE)-objs += algs/edonr/edonr.o
 $(MODULE)-objs += algs/sha1/sha1.o
 $(MODULE)-objs += algs/sha2/sha2.o
+$(MODULE)-objs += algs/sha1/sha1.o
+$(MODULE)-objs += algs/skein/skein.o
+$(MODULE)-objs += algs/skein/skein_block.o
+$(MODULE)-objs += algs/skein/skein_iv.o
 $(MODULE)-objs += $(ASM_SOURCES)
 
 ICP_DIRS = \
@@ -67,9 +75,11 @@ ICP_DIRS = \
 	os \
 	algs \
 	algs/aes \
+	algs/edonr \
 	algs/modes \
 	algs/sha1 \
 	algs/sha2 \
+	algs/skein \
 	asm-x86_64 \
 	asm-x86_64/aes \
 	asm-x86_64/modes \
diff --git a/module/icp/algs/edonr/edonr.c b/module/icp/algs/edonr/edonr.c
new file mode 100644
index 000000000..8ae989890
--- /dev/null
+++ b/module/icp/algs/edonr/edonr.c
@@ -0,0 +1,751 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <[email protected]>
+ * Tweaked Edon-R implementation for SUPERCOP, based on NIST API.
+ *
+ * $Id: edonr.c 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+/* determine where we can get bcopy/bzero declarations */
+#ifdef	_KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+#include <sys/edonr.h>
+#include <sys/debug.h>
+
+/* big endian support, provides no-op's if run on little endian hosts */
+#include "edonr_byteorder.h"
+
+#define	hashState224(x)	((x)->pipe->p256)
+#define	hashState256(x)	((x)->pipe->p256)
+#define	hashState384(x)	((x)->pipe->p512)
+#define	hashState512(x)	((x)->pipe->p512)
+
+/* shift and rotate shortcuts (every argument is parenthesized) */
+#define	shl(x, n)	((x) << (n))
+#define	shr(x, n)	((x) >> (n))
+
+#define	rotl32(x, n)	(((x) << (n)) | ((x) >> (32 - (n))))
+#define	rotr32(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
+
+#define	rotl64(x, n)	(((x) << (n)) | ((x) >> (64 - (n))))
+#define	rotr64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))
+
+#if !defined(__C99_RESTRICT)
+#define	restrict	/* restrict */
+#endif
+
+#define	EDONR_VALID_HASHBITLEN(x) \
+	((x) == 512 || (x) == 384 || (x) == 256 || (x) == 224)
+
+/* EdonR224 initial double chaining pipe */
+static const uint32_t i224p2[16] = {
+	0x00010203ul, 0x04050607ul, 0x08090a0bul, 0x0c0d0e0ful,
+	0x10111213ul, 0x14151617ul, 0x18191a1bul, 0x1c1d1e1ful,
+	0x20212223ul, 0x24252627ul, 0x28292a2bul, 0x2c2d2e2ful,
+	0x30313233ul, 0x34353637ul, 0x38393a3bul, 0x3c3d3e3ful,
+};
+
+/* EdonR256 initial double chaining pipe */
+static const uint32_t i256p2[16] = {
+	0x40414243ul, 0x44454647ul, 0x48494a4bul, 0x4c4d4e4ful,
+	0x50515253ul, 0x54555657ul, 0x58595a5bul, 0x5c5d5e5ful,
+	0x60616263ul, 0x64656667ul, 0x68696a6bul, 0x6c6d6e6ful,
+	0x70717273ul, 0x74757677ul, 0x78797a7bul, 0x7c7d7e7ful,
+};
+
+/* EdonR384 initial double chaining pipe */
+static const uint64_t i384p2[16] = {
+	0x0001020304050607ull, 0x08090a0b0c0d0e0full,
+	0x1011121314151617ull, 0x18191a1b1c1d1e1full,
+	0x2021222324252627ull, 0x28292a2b2c2d2e2full,
+	0x3031323334353637ull, 0x38393a3b3c3d3e3full,
+	0x4041424344454647ull, 0x48494a4b4c4d4e4full,
+	0x5051525354555657ull, 0x58595a5b5c5d5e5full,
+	0x6061626364656667ull, 0x68696a6b6c6d6e6full,
+	0x7071727374757677ull, 0x78797a7b7c7d7e7full
+};
+
+/* EdonR512 initial double chaining pipe */
+static const uint64_t i512p2[16] = {
+	0x8081828384858687ull, 0x88898a8b8c8d8e8full,
+	0x9091929394959697ull, 0x98999a9b9c9d9e9full,
+	0xa0a1a2a3a4a5a6a7ull, 0xa8a9aaabacadaeafull,
+	0xb0b1b2b3b4b5b6b7ull, 0xb8b9babbbcbdbebfull,
+	0xc0c1c2c3c4c5c6c7ull, 0xc8c9cacbcccdcecfull,
+	0xd0d1d2d3d4d5d6d7ull, 0xd8d9dadbdcdddedfull,
+	0xe0e1e2e3e4e5e6e7ull, 0xe8e9eaebecedeeefull,
+	0xf0f1f2f3f4f5f6f7ull, 0xf8f9fafbfcfdfeffull
+};
+
+/*
+ * First Latin Square
+ * 0   7   1   3   2   4   6   5
+ * 4   1   7   6   3   0   5   2
+ * 7   0   4   2   5   3   1   6
+ * 1   4   0   5   6   2   7   3
+ * 2   3   6   7   1   5   0   4
+ * 5   2   3   1   7   6   4   0
+ * 3   6   5   0   4   7   2   1
+ * 6   5   2   4   0   1   3   7
+ */
+#define	LS1_256(c, x0, x1, x2, x3, x4, x5, x6, x7)			\
+{									\
+	uint32_t x04, x17, x23, x56, x07, x26;				\
+	x04 = x0+x4, x17 = x1+x7, x07 = x04+x17;			\
+	s0 = c + x07 + x2;						\
+	s1 = rotl32(x07 + x3, 4);					\
+	s2 = rotl32(x07 + x6, 8);					\
+	x23 = x2 + x3;							\
+	s5 = rotl32(x04 + x23 + x5, 22);				\
+	x56 = x5 + x6;							\
+	s6 = rotl32(x17 + x56 + x0, 24);				\
+	x26 = x23+x56;							\
+	s3 = rotl32(x26 + x7, 13);					\
+	s4 = rotl32(x26 + x1, 17);					\
+	s7 = rotl32(x26 + x4, 29);					\
+}
+
+#define	LS1_512(c, x0, x1, x2, x3, x4, x5, x6, x7)			\
+{									\
+	uint64_t x04, x17, x23, x56, x07, x26;				\
+	x04 = x0+x4, x17 = x1+x7, x07 = x04+x17;			\
+	s0 = c + x07 + x2;						\
+	s1 = rotl64(x07 + x3, 5);					\
+	s2 = rotl64(x07 + x6, 15);					\
+	x23 = x2 + x3;							\
+	s5 = rotl64(x04 + x23 + x5, 40);				\
+	x56 = x5 + x6;							\
+	s6 = rotl64(x17 + x56 + x0, 50);				\
+	x26 = x23+x56;							\
+	s3 = rotl64(x26 + x7, 22);					\
+	s4 = rotl64(x26 + x1, 31);					\
+	s7 = rotl64(x26 + x4, 59);					\
+}
+
+/*
+ * Second Orthogonal Latin Square
+ * 0   4   2   3   1   6   5   7
+ * 7   6   3   2   5   4   1   0
+ * 5   3   1   6   0   2   7   4
+ * 1   0   5   4   3   7   2   6
+ * 2   1   0   7   4   5   6   3
+ * 3   5   7   0   6   1   4   2
+ * 4   7   6   1   2   0   3   5
+ * 6   2   4   5   7   3   0   1
+ */
+#define	LS2_256(c, y0, y1, y2, y3, y4, y5, y6, y7)			\
+{									\
+	uint32_t y01, y25, y34, y67, y04, y05, y27, y37;		\
+	y01 = y0+y1, y25 = y2+y5, y05 = y01+y25;			\
+	t0  = ~c + y05 + y7;						\
+	t2 = rotl32(y05 + y3, 9);					\
+	y34 = y3+y4, y04 = y01+y34;					\
+	t1 = rotl32(y04 + y6, 5);					\
+	t4 = rotl32(y04 + y5, 15);					\
+	y67 = y6+y7, y37 = y34+y67;					\
+	t3 = rotl32(y37 + y2, 11);					\
+	t7 = rotl32(y37 + y0, 27);					\
+	y27 = y25+y67;							\
+	t5 = rotl32(y27 + y4, 20);					\
+	t6 = rotl32(y27 + y1, 25);					\
+}
+
+#define	LS2_512(c, y0, y1, y2, y3, y4, y5, y6, y7)			\
+{									\
+	uint64_t y01, y25, y34, y67, y04, y05, y27, y37;		\
+	y01 = y0+y1, y25 = y2+y5, y05 = y01+y25;			\
+	t0  = ~c + y05 + y7;						\
+	t2 = rotl64(y05 + y3, 19);					\
+	y34 = y3+y4, y04 = y01+y34;					\
+	t1 = rotl64(y04 + y6, 10);					\
+	t4 = rotl64(y04 + y5, 36);					\
+	y67 = y6+y7, y37 = y34+y67;					\
+	t3 = rotl64(y37 + y2, 29);					\
+	t7 = rotl64(y37 + y0, 55);					\
+	y27 = y25+y67;							\
+	t5 = rotl64(y27 + y4, 44);					\
+	t6 = rotl64(y27 + y1, 48);					\
+}
+
+#define	quasi_exform256(r0, r1, r2, r3, r4, r5, r6, r7)			\
+{									\
+	uint32_t s04, s17, s23, s56, t01, t25, t34, t67;		\
+	s04 = s0 ^ s4, t01 = t0 ^ t1;					\
+	r0 = (s04 ^ s1) + (t01 ^ t5);					\
+	t67 = t6 ^ t7;							\
+	r1 = (s04 ^ s7) + (t2 ^ t67);					\
+	s23 = s2 ^ s3;							\
+	r7 = (s23 ^ s5) + (t4 ^ t67);					\
+	t34 = t3 ^ t4;							\
+	r3 = (s23 ^ s4) + (t0 ^ t34);					\
+	s56 = s5 ^ s6;							\
+	r5 = (s3 ^ s56) + (t34 ^ t6);					\
+	t25 = t2 ^ t5;							\
+	r6 = (s2 ^ s56) + (t25 ^ t7);					\
+	s17 = s1 ^ s7;							\
+	r4 = (s0 ^ s17) + (t1 ^ t25);					\
+	r2 = (s17 ^ s6) + (t01 ^ t3);					\
+}
+
+#define	quasi_exform512(r0, r1, r2, r3, r4, r5, r6, r7)			\
+{									\
+	uint64_t s04, s17, s23, s56, t01, t25, t34, t67;		\
+	s04 = s0 ^ s4, t01 = t0 ^ t1;					\
+	r0 = (s04 ^ s1) + (t01 ^ t5);					\
+	t67 = t6 ^ t7;							\
+	r1 = (s04 ^ s7) + (t2 ^ t67);					\
+	s23 = s2 ^ s3;							\
+	r7 = (s23 ^ s5) + (t4 ^ t67);					\
+	t34 = t3 ^ t4;							\
+	r3 = (s23 ^ s4) + (t0 ^ t34);					\
+	s56 = s5 ^ s6;							\
+	r5 = (s3 ^ s56) + (t34 ^ t6);					\
+	t25 = t2 ^ t5;							\
+	r6 = (s2 ^ s56) + (t25 ^ t7);					\
+	s17 = s1 ^ s7;							\
+	r4 = (s0 ^ s17) + (t1 ^ t25);					\
+	r2 = (s17 ^ s6) + (t01 ^ t3);					\
+}
+/* Hash each whole block (16 uint32_t) of data into p; return bits consumed. */
+static size_t
+Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p)
+{
+	size_t bl;
+
+	for (bl = bitlen; bl >= EdonR256_BLOCK_BITSIZE;
+	    bl -= EdonR256_BLOCK_BITSIZE, data += 16) {
+		uint32_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+		    t5, t6, t7;
+		uint32_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+		    q5, q6, q7;
+		const uint32_t defix = 0xaaaaaaaa;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint32_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+		    swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define	d(j)	swp ## j	/* byte-swapped copy of word j */
+#define	s32(j)	ld_swap32((uint32_t *)data + j, swp ## j)
+#else
+#define	d(j)	data[j]		/* no swap needed: read in place */
+#endif
+
+		/* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s32(8);
+		s32(9);
+		s32(10);
+		s32(11);
+		s32(12);
+		s32(13);
+		s32(14);
+		s32(15);
+#endif
+		LS1_256(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+		    d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s32(0);
+		s32(1);
+		s32(2);
+		s32(3);
+		s32(4);
+		s32(5);
+		s32(6);
+		s32(7);
+#undef s32
+#endif
+		LS2_256(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+		    d(15));
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Second row of quasigroup e-transformations */
+		LS1_256(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+		    p[15]);
+		LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Third row of quasigroup e-transformations */
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Fourth row of quasigroup e-transformations */
+		LS1_256(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+		LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Edon-R tweak on the original SHA-3 Edon-R submission. */
+		p[0] ^= d(8) ^ p0;
+		p[1] ^= d(9) ^ p1;
+		p[2] ^= d(10) ^ p2;
+		p[3] ^= d(11) ^ p3;
+		p[4] ^= d(12) ^ p4;
+		p[5] ^= d(13) ^ p5;
+		p[6] ^= d(14) ^ p6;
+		p[7] ^= d(15) ^ p7;
+		p[8] ^= d(0) ^ q0;
+		p[9] ^= d(1) ^ q1;
+		p[10] ^= d(2) ^ q2;
+		p[11] ^= d(3) ^ q3;
+		p[12] ^= d(4) ^ q4;
+		p[13] ^= d(5) ^ q5;
+		p[14] ^= d(6) ^ q6;
+		p[15] ^= d(7) ^ q7;
+	}
+
+#undef d
+	return (bitlen - bl);	/* bl = bits left over (less than one block) */
+}
+
+/*
+ * Why is this #pragma here?
+ *
+ * Checksum functions like this one can go over the stack frame size check
+ * Linux imposes on 32-bit platforms (-Wframe-larger-than=1024).  We can
+ * safely ignore the compiler error since we know that, in ZoL,
+ * the function will be called from a worker thread that won't be using
+ * much stack.  The only function that goes over the 1k limit is Q512(),
+ * which only goes over it by a hair (1248 bytes on ARM32).
+ */
+#include <sys/isa_defs.h>	/* for _ILP32 */
+#ifdef _ILP32   /* We're 32-bit, assume small stack frames */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+/* Hash each whole block (16 uint64_t) of data into p; return bits consumed. */
+#if defined(__IBMC__) && defined(_AIX) && defined(__64BIT__)
+static inline size_t
+#else
+static size_t
+#endif
+Q512(size_t bitlen, const uint64_t *data, uint64_t *restrict p)
+{
+	size_t bl;
+
+	for (bl = bitlen; bl >= EdonR512_BLOCK_BITSIZE;
+	    bl -= EdonR512_BLOCK_BITSIZE, data += 16) {
+		uint64_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+		    t5, t6, t7;
+		uint64_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+		    q5, q6, q7;
+		const uint64_t defix = 0xaaaaaaaaaaaaaaaaull;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint64_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+		    swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define	d(j)	swp##j		/* byte-swapped copy of word j */
+#define	s64(j)	ld_swap64((uint64_t *)data+j, swp##j)
+#else
+#define	d(j)	data[j]		/* no swap needed: read in place */
+#endif
+
+		/* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s64(8);
+		s64(9);
+		s64(10);
+		s64(11);
+		s64(12);
+		s64(13);
+		s64(14);
+		s64(15);
+#endif
+		LS1_512(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+		    d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s64(0);
+		s64(1);
+		s64(2);
+		s64(3);
+		s64(4);
+		s64(5);
+		s64(6);
+		s64(7);
+#undef s64
+#endif
+		LS2_512(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+		    d(15));
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Second row of quasigroup e-transformations */
+		LS1_512(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+		    p[15]);
+		LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Third row of quasigroup e-transformations */
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Fourth row of quasigroup e-transformations */
+		LS1_512(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+		LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Edon-R tweak on the original SHA-3 Edon-R submission. */
+		p[0] ^= d(8) ^ p0;
+		p[1] ^= d(9) ^ p1;
+		p[2] ^= d(10) ^ p2;
+		p[3] ^= d(11) ^ p3;
+		p[4] ^= d(12) ^ p4;
+		p[5] ^= d(13) ^ p5;
+		p[6] ^= d(14) ^ p6;
+		p[7] ^= d(15) ^ p7;
+		p[8] ^= d(0) ^ q0;
+		p[9] ^= d(1) ^ q1;
+		p[10] ^= d(2) ^ q2;
+		p[11] ^= d(3) ^ q3;
+		p[12] ^= d(4) ^ q4;
+		p[13] ^= d(5) ^ q5;
+		p[14] ^= d(6) ^ q6;
+		p[15] ^= d(7) ^ q7;
+	}
+
+#undef d
+	return (bitlen - bl);	/* bl = bits left over (less than one block) */
+}
+
+/*
+ * Reset the counters and load the initial double pipe for `hashbitlen' (224,
+ * 256, 384 or 512; other values trip the ASSERT and are otherwise ignored).
+ */
+void
+EdonRInit(EdonRState *state, size_t hashbitlen)
+{
+	ASSERT(EDONR_VALID_HASHBITLEN(hashbitlen));
+	switch (hashbitlen) {
+	case 224:
+		state->hashbitlen = 224;
+		state->bits_processed = 0;
+		state->unprocessed_bits = 0;
+		bcopy(i224p2, hashState224(state)->DoublePipe, sizeof (i224p2));
+		break;
+
+	case 256:
+		state->hashbitlen = 256;
+		state->bits_processed = 0;
+		state->unprocessed_bits = 0;
+		bcopy(i256p2, hashState256(state)->DoublePipe, sizeof (i256p2));
+		break;
+
+	case 384:
+		state->hashbitlen = 384;
+		state->bits_processed = 0;
+		state->unprocessed_bits = 0;
+		bcopy(i384p2, hashState384(state)->DoublePipe, sizeof (i384p2));
+		break;
+
+	case 512:	/* 64-bit pipe view, as EdonRUpdate/EdonRFinal use */
+		state->hashbitlen = 512;
+		state->bits_processed = 0;
+		state->unprocessed_bits = 0;
+		bcopy(i512p2, hashState512(state)->DoublePipe, sizeof (i512p2));
+		break;
+	}
+}
+
+/* Absorb databitlen bits of data; a partial last block is kept in LastPart. */
+void
+EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen)
+{
+	uint32_t *data32;
+	uint64_t *data64;
+
+	size_t bits_processed;
+
+	ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+	switch (state->hashbitlen) {
+	case 224:
+	case 256:
+		if (state->unprocessed_bits > 0) {	/* append to buffer */
+			/* LastBytes = databitlen / 8 */
+			int LastBytes = (int)databitlen >> 3;
+
+			ASSERT(state->unprocessed_bits + databitlen <=
+			    EdonR256_BLOCK_SIZE * 8);
+
+			bcopy(data, hashState256(state)->LastPart
+			    + (state->unprocessed_bits >> 3), LastBytes);
+			state->unprocessed_bits += (int)databitlen;
+			databitlen = state->unprocessed_bits;
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data32 = (uint32_t *)hashState256(state)->LastPart;
+		} else
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data32 = (uint32_t *)data;
+
+		bits_processed = Q256(databitlen, data32,
+		    hashState256(state)->DoublePipe);
+		state->bits_processed += bits_processed;
+		databitlen -= bits_processed;
+		state->unprocessed_bits = (int)databitlen;
+		if (databitlen > 0) {
+			/* LastBytes = Ceil(databitlen / 8) */
+			int LastBytes =
+			    ((~(((-(int)databitlen) >> 3) & 0x01ff)) +
+			    1) & 0x01ff;
+
+			data32 += bits_processed >> 5;	/* skip hashed words */
+			bcopy(data32, hashState256(state)->LastPart, LastBytes);
+		}
+		break;
+
+	case 384:
+	case 512:
+		if (state->unprocessed_bits > 0) {	/* append to buffer */
+			/* LastBytes = databitlen / 8 */
+			int LastBytes = (int)databitlen >> 3;
+
+			ASSERT(state->unprocessed_bits + databitlen <=
+			    EdonR512_BLOCK_SIZE * 8);
+
+			bcopy(data, hashState512(state)->LastPart
+			    + (state->unprocessed_bits >> 3), LastBytes);
+			state->unprocessed_bits += (int)databitlen;
+			databitlen = state->unprocessed_bits;
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data64 = (uint64_t *)hashState512(state)->LastPart;
+		} else
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data64 = (uint64_t *)data;
+
+		bits_processed = Q512(databitlen, data64,
+		    hashState512(state)->DoublePipe);
+		state->bits_processed += bits_processed;
+		databitlen -= bits_processed;
+		state->unprocessed_bits = (int)databitlen;
+		if (databitlen > 0) {
+			/* LastBytes = Ceil(databitlen / 8) */
+			int LastBytes =
+			    ((~(((-(int)databitlen) >> 3) & 0x03ff)) +
+			    1) & 0x03ff;
+
+			data64 += bits_processed >> 6;	/* skip hashed words */
+			bcopy(data64, hashState512(state)->LastPart, LastBytes);
+		}
+		break;
+	}
+}
+/* Pad the buffered tail, hash it, and copy the digest into hashval. */
+void
+EdonRFinal(EdonRState *state, uint8_t *hashval)
+{
+	uint32_t *data32;
+	uint64_t *data64, num_bits;
+
+	size_t databitlen;
+	int LastByte, PadOnePosition;
+
+	num_bits = state->bits_processed + state->unprocessed_bits;
+	ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+	switch (state->hashbitlen) {
+	case 224:
+	case 256:
+		LastByte = (int)state->unprocessed_bits >> 3;
+		PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+		hashState256(state)->LastPart[LastByte] =
+		    (hashState256(state)->LastPart[LastByte]
+		    & (0xff << (PadOnePosition + 1))) ^
+		    (0x01 << PadOnePosition);
+		/* LINTED E_BAD_PTR_CAST_ALIGN */
+		data64 = (uint64_t *)hashState256(state)->LastPart;
+
+		if (state->unprocessed_bits < 448) {	/* pad to one block */
+			(void) memset((hashState256(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR256_BLOCK_SIZE - LastByte - 9);
+			databitlen = EdonR256_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 7);
+#else
+			data64[7] = num_bits;
+#endif
+		} else {				/* pad to two blocks */
+			(void) memset((hashState256(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR256_BLOCK_SIZE * 2 - LastByte - 9);
+			databitlen = EdonR256_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 15);
+#else
+			data64[15] = num_bits;
+#endif
+		}
+
+		/* LINTED E_BAD_PTR_CAST_ALIGN */
+		data32 = (uint32_t *)hashState256(state)->LastPart;
+		state->bits_processed += Q256(databitlen, data32,
+		    hashState256(state)->DoublePipe);
+		break;
+
+	case 384:
+	case 512:
+		LastByte = (int)state->unprocessed_bits >> 3;
+		PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+		hashState512(state)->LastPart[LastByte] =
+		    (hashState512(state)->LastPart[LastByte]
+		    & (0xff << (PadOnePosition + 1))) ^
+		    (0x01 << PadOnePosition);
+		/* LINTED E_BAD_PTR_CAST_ALIGN */
+		data64 = (uint64_t *)hashState512(state)->LastPart;
+
+		if (state->unprocessed_bits < 960) {	/* pad to one block */
+			(void) memset((hashState512(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR512_BLOCK_SIZE - LastByte - 9);
+			databitlen = EdonR512_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 15);
+#else
+			data64[15] = num_bits;
+#endif
+		} else {				/* pad to two blocks */
+			(void) memset((hashState512(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR512_BLOCK_SIZE * 2 - LastByte - 9);
+			databitlen = EdonR512_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 31);
+#else
+			data64[31] = num_bits;
+#endif
+		}
+
+		state->bits_processed += Q512(databitlen, data64,
+		    hashState512(state)->DoublePipe);
+		break;
+	}
+
+	switch (state->hashbitlen) {	/* copy the digest out of the pipe */
+	case 224: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint32_t *d32 = (uint32_t *)hashval;
+		uint32_t *s32 = hashState224(state)->DoublePipe + 9;
+		int j;
+
+		for (j = 0; j < EdonR224_DIGEST_SIZE >> 2; j++)
+			st_swap32(s32[j], d32 + j);
+#else
+		bcopy(hashState256(state)->DoublePipe + 9, hashval,
+		    EdonR224_DIGEST_SIZE);
+#endif
+		break;
+	}
+	case 256: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint32_t *d32 = (uint32_t *)hashval;
+		uint32_t *s32 = hashState224(state)->DoublePipe + 8;
+		int j;
+
+		for (j = 0; j < EdonR256_DIGEST_SIZE >> 2; j++)
+			st_swap32(s32[j], d32 + j);
+#else
+		bcopy(hashState256(state)->DoublePipe + 8, hashval,
+		    EdonR256_DIGEST_SIZE);
+#endif
+		break;
+	}
+	case 384: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint64_t *d64 = (uint64_t *)hashval;
+		uint64_t *s64 = hashState384(state)->DoublePipe + 10;
+		int j;
+
+		for (j = 0; j < EdonR384_DIGEST_SIZE >> 3; j++)
+			st_swap64(s64[j], d64 + j);
+#else
+		bcopy(hashState384(state)->DoublePipe + 10, hashval,
+		    EdonR384_DIGEST_SIZE);
+#endif
+		break;
+	}
+	case 512: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint64_t *d64 = (uint64_t *)hashval;
+		uint64_t *s64 = hashState512(state)->DoublePipe + 8;
+		int j;
+
+		for (j = 0; j < EdonR512_DIGEST_SIZE >> 3; j++)
+			st_swap64(s64[j], d64 + j);
+#else
+		bcopy(hashState512(state)->DoublePipe + 8, hashval,
+		    EdonR512_DIGEST_SIZE);
+#endif
+		break;
+	}
+	}
+}
+
+/* One-shot helper: hash databitlen bits of data into a hashbitlen digest. */
+void
+EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen,
+    uint8_t *hashval)
+{
+	EdonRState ctx;
+
+	EdonRInit(&ctx, hashbitlen);
+	EdonRUpdate(&ctx, data, databitlen);
+	EdonRFinal(&ctx, hashval);
+}
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(EdonRInit);
+EXPORT_SYMBOL(EdonRUpdate);
+EXPORT_SYMBOL(EdonRHash);
+EXPORT_SYMBOL(EdonRFinal);
+#endif
diff --git a/module/icp/algs/edonr/edonr_byteorder.h b/module/icp/algs/edonr/edonr_byteorder.h
new file mode 100644
index 000000000..d17e8f1fd
--- /dev/null
+++ b/module/icp/algs/edonr/edonr_byteorder.h
@@ -0,0 +1,216 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <[email protected]>
+ *
+ * C header file to determine compile machine byte order. Take care when cross
+ * compiling.
+ *
+ * $Id: byteorder.h 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#ifndef _CRYPTO_EDONR_BYTEORDER_H
+#define	_CRYPTO_EDONR_BYTEORDER_H
+
+
+#include <sys/param.h>
+
+#if defined(__BYTE_ORDER)
+#if (__BYTE_ORDER == __BIG_ENDIAN)
+#define	MACHINE_IS_BIG_ENDIAN
+#elif (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define	MACHINE_IS_LITTLE_ENDIAN
+#endif
+#elif defined(BYTE_ORDER)
+#if (BYTE_ORDER == BIG_ENDIAN)
+#define	MACHINE_IS_BIG_ENDIAN
+#elif (BYTE_ORDER == LITTLE_ENDIAN)
+#define	MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* __BYTE_ORDER || BYTE_ORDER */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#if defined(_BIG_ENDIAN) || defined(_MIPSEB)
+#define	MACHINE_IS_BIG_ENDIAN
+#endif
+#if defined(_LITTLE_ENDIAN) || defined(_MIPSEL)
+#define	MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* !MACHINE_IS_BIG_ENDIAN && !MACHINE_IS_LITTLE_ENDIAN */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#error unknown machine byte sex
+#endif
+
+#define	BYTEORDER_INCLUDED
+
+#if defined(MACHINE_IS_BIG_ENDIAN)
+/*
+ * Byte swapping macros for big endian architectures and compilers,
+ * add as appropriate for other architectures and/or compilers.
+ *
+ *     ld_swap64(src,dst) : uint64_t dst = *(src)
+ *     st_swap64(src,dst) : *(dst)       = uint64_t src
+ */
+
+#if defined(__PPC__) || defined(_ARCH_PPC)
+
+#if defined(__64BIT__)
+#if defined(_ARCH_PWR7)
+#define	aix_ld_swap64(s64, d64)\
+	__asm__("ldbrx %0,0,%1" : "=r"(d64) : "r"(s64))
+#define	aix_st_swap64(s64, d64)\
+	__asm__ volatile("stdbrx %1,0,%0" : : "r"(d64), "r"(s64))
+#else
+#define	aix_ld_swap64(s64, d64)						\
+{									\
+	uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */	\
+									\
+	__asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0;rldimi %1,%2,32,0"\
+		: "+r"(s4), "=r"(d64), "=r"(h) : "b"(s64));		\
+}
+
+#define	aix_st_swap64(s64, d64)						\
+{									\
+	uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */	\
+	h = (s64) >> 32;						\
+	__asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0"	\
+		: "+r"(s4) : "r"(s64), "r"(h), "b"(d64));		\
+}
+#endif /* 64BIT && PWR7 */
+#else
+#define	aix_ld_swap64(s64, d64)						\
+{									\
+	uint32_t *s4 = 0, h, l;	/* initialize to zero for gcc warning */\
+	__asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0"		\
+		: "+r"(s4), "=r"(l), "=r"(h) : "b"(s64));		\
+	d64 = ((uint64_t)h<<32) | l;					\
+}
+
+#define	aix_st_swap64(s64, d64)						\
+{									\
+	uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
+	l = (s64) & 0xfffffffful, h = (s64) >> 32;			\
+	__asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0"	\
+		: "+r"(s4) : "r"(l), "r"(h), "b"(d64));			\
+}
+#endif /* __64BIT__ */
+#define	aix_ld_swap32(s32, d32)\
+	__asm__("lwbrx %0,0,%1" : "=r"(d32) : "r"(s32))
+#define	aix_st_swap32(s32, d32)\
+	__asm__ volatile("stwbrx %1,0,%0" : : "r"(d32), "r"(s32))
+#define	ld_swap32(s, d) aix_ld_swap32(s, d)
+#define	st_swap32(s, d) aix_st_swap32(s, d)
+#define	ld_swap64(s, d) aix_ld_swap64(s, d)
+#define	st_swap64(s, d) aix_st_swap64(s, d)
+#endif /* __PPC__ || _ARCH_PPC */
+
+#if defined(__sparc)
+#if !defined(__arch64__) && !defined(__sparcv8) && defined(__sparcv9)
+#define	__arch64__
+#endif
+#if defined(__GNUC__) || (defined(__SUNPRO_C) && __SUNPRO_C > 0x590)
+/* need Sun Studio C 5.10 and above for GNU inline assembly */
+#if defined(__arch64__)
+#define	sparc_ld_swap64(s64, d64)					\
+	__asm__("ldxa [%1]0x88,%0" : "=r"(d64) : "r"(s64))
+#define	sparc_st_swap64(s64, d64)					\
+	__asm__ volatile("stxa %0,[%1]0x88" : : "r"(s64), "r"(d64))
+#define	st_swap64(s, d) sparc_st_swap64(s, d)
+#else
+#define	sparc_ld_swap64(s64, d64)					\
+{									\
+	uint32_t *s4, h, l;						\
+	__asm__("add %3,4,%0\n\tlda [%3]0x88,%1\n\tlda [%0]0x88,%2"	\
+		: "+r"(s4), "=r"(l), "=r"(h) : "r"(s64));		\
+	d64 = ((uint64_t)h<<32) | l;					\
+}
+#define	sparc_st_swap64(s64, d64)					\
+{									\
+	uint32_t *s4, h, l;						\
+	l = (s64) & 0xfffffffful, h = (s64) >> 32;			\
+	__asm__ volatile("add %3,4,%0\n\tsta %1,[%3]0x88\n\tsta %2,[%0]0x88"\
+		: "+r"(s4) : "r"(l), "r"(h), "r"(d64));			\
+}
+#endif /* sparc64 */
+#define	sparc_ld_swap32(s32, d32)\
+	__asm__("lda [%1]0x88,%0" : "=r"(d32) : "r"(s32))
+#define	sparc_st_swap32(s32, d32)\
+	__asm__ volatile("sta %0,[%1]0x88" : : "r"(s32), "r"(d32))
+#define	ld_swap32(s, d) sparc_ld_swap32(s, d)
+#define	st_swap32(s, d) sparc_st_swap32(s, d)
+#define	ld_swap64(s, d) sparc_ld_swap64(s, d)
+#define	st_swap64(s, d) sparc_st_swap64(s, d)
+#endif /* GCC || Sun Studio C > 5.9 */
+#endif /* sparc */
+
+/* GCC fallback */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap32)
+#define	ld_swap32(s, d) (d = __builtin_bswap32(*(s)))
+#define	st_swap32(s, d) (*(d) = __builtin_bswap32(s))
+#endif /* GCC4/PGIC && !swap32 */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap64)
+#define	ld_swap64(s, d) (d = __builtin_bswap64(*(s)))
+#define	st_swap64(s, d) (*(d) = __builtin_bswap64(s))
+#endif /* GCC4/PGIC && !swap64 */
+
+/* generic fallback */
+#if !defined(ld_swap32)
+#define	ld_swap32(s, d)							\
+	(d = (*(s) >> 24) | (*(s) >> 8 & 0xff00) |			\
+	(*(s) << 8 & 0xff0000) | (*(s) << 24))
+#define	st_swap32(s, d)							\
+	(*(d) = ((s) >> 24) | ((s) >> 8 & 0xff00) |			\
+	((s) << 8 & 0xff0000) | ((s) << 24))
+#endif
+#if !defined(ld_swap64)
+#define	ld_swap64(s, d)							\
+	(d = (*(s) >> 56) | (*(s) >> 40 & 0xff00) |			\
+	(*(s) >> 24 & 0xff0000) | (*(s) >> 8 & 0xff000000) |		\
+	(*(s) & 0xff000000) << 8 | (*(s) & 0xff0000) << 24 |		\
+	(*(s) & 0xff00) << 40 | *(s) << 56)
+#define	st_swap64(s, d)							\
+	(*(d) = ((s) >> 56) | ((s) >> 40 & 0xff00) |			\
+	((s) >> 24 & 0xff0000) | ((s) >> 8 & 0xff000000) |		\
+	((s) & 0xff000000) << 8 | ((s) & 0xff0000) << 24 |		\
+	((s) & 0xff00) << 40 | (s) << 56)
+#endif
+
+#endif /* MACHINE_IS_BIG_ENDIAN */
+
+
+#if defined(MACHINE_IS_LITTLE_ENDIAN)
+/* replace swaps with simple assignments on little endian systems */
+#undef	ld_swap32
+#undef	st_swap32
+#define	ld_swap32(s, d) (d = *(s))
+#define	st_swap32(s, d) (*(d) = s)
+#undef	ld_swap64
+#undef	st_swap64
+#define	ld_swap64(s, d) (d = *(s))
+#define	st_swap64(s, d) (*(d) = s)
+#endif /* MACHINE_IS_LITTLE_ENDIAN */
+
+#endif /* _CRYPTO_EDONR_BYTEORDER_H */
diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c
index 792ca8825..dbe008190 100644
--- a/module/icp/algs/sha2/sha2.c
+++ b/module/icp/algs/sha2/sha2.c
@@ -38,7 +38,7 @@
 
 #include <sys/zfs_context.h>
 #define	_SHA2_IMPL
-#include <sha2/sha2.h>
+#include <sys/sha2.h>
 #include <sha2/sha2_consts.h>
 
 #define	_RESTRICT_KYWD
@@ -47,18 +47,37 @@
 #include <sys/byteorder.h>
 #define	HAVE_HTONL
 #endif
+#include <sys/isa_defs.h>	/* for _ILP32 */
 
 static void Encode(uint8_t *, uint32_t *, size_t);
+static void Encode64(uint8_t *, uint64_t *, size_t);
 
 #if	defined(__amd64)
+#define	SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1)
 #define	SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1)
+
+void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
 void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+
 #else
 static void SHA256Transform(SHA2_CTX *, const uint8_t *);
+static void SHA512Transform(SHA2_CTX *, const uint8_t *);
 #endif	/* __amd64 */
 
 static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
 
+/*
+ * The low-level checksum routines use a lot of stack space. On small-stack
+ * systems (e.g. 32-bit kernel builds), insert compiler memory barriers to
+ * shrink the frame: SHA512Transform() drops from 3k to <1k on ARM32, for
+ * example. Expands to a bare statement; the use site supplies the ';'.
+ */
+#if defined(_ILP32) || defined(__powerpc)	/* small stack */
+#define	SMALL_STACK_MEMORY_BARRIER	asm volatile("": : :"memory")
+#else
+#define	SMALL_STACK_MEMORY_BARRIER
+#endif
+
 /* Ch and Maj are the basic SHA2 functions. */
 #define	Ch(b, c, d)	(((b) & (c)) ^ ((~b) & (d)))
 #define	Maj(b, c, d)	(((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d)))
@@ -82,6 +101,18 @@ static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
 	T2 = BIGSIGMA0_256(a) + Maj(a, b, c);				\
 	h = T1 + T2
 
+/* SHA384/512 Functions */
+#define	BIGSIGMA0(x)	(ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
+#define	BIGSIGMA1(x)	(ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))
+#define	SIGMA0(x)	(ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7))
+#define	SIGMA1(x)	(ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6))
+#define	SHA512ROUND(a, b, c, d, e, f, g, h, i, w)			\
+	T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w;	\
+	d += T1;							\
+	T2 = BIGSIGMA0(a) + Maj(a, b, c);				\
+	h = T1 + T2;							\
+	SMALL_STACK_MEMORY_BARRIER;
+
 /*
  * sparc optimization:
  *
@@ -130,6 +161,33 @@ SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk)
 	uint32_t w8, w9, w10, w11, w12, w13, w14, w15;
 	uint32_t T1, T2;
 
+#if	defined(__sparc)
+	static const uint32_t sha256_consts[] = {
+		SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2,
+		SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5,
+		SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8,
+		SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11,
+		SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14,
+		SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17,
+		SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20,
+		SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23,
+		SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26,
+		SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29,
+		SHA256_CONST_30, SHA256_CONST_31, SHA256_CONST_32,
+		SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35,
+		SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38,
+		SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41,
+		SHA256_CONST_42, SHA256_CONST_43, SHA256_CONST_44,
+		SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47,
+		SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50,
+		SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53,
+		SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56,
+		SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59,
+		SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62,
+		SHA256_CONST_63
+	};
+#endif	/* __sparc */
+
 	if ((uintptr_t)blk & 0x3) {		/* not 4-byte aligned? */
 		bcopy(blk, ctx->buf_un.buf32,  sizeof (ctx->buf_un.buf32));
 		blk = (uint8_t *)ctx->buf_un.buf32;
@@ -292,6 +350,256 @@ SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk)
 	ctx->state.s32[6] += g;
 	ctx->state.s32[7] += h;
 }
+
+
+/* SHA384 and SHA512 Transform: folds one 128-byte block at blk into ctx */
+
+static void
+SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk)
+{
+
+	uint64_t a = ctx->state.s64[0];
+	uint64_t b = ctx->state.s64[1];
+	uint64_t c = ctx->state.s64[2];
+	uint64_t d = ctx->state.s64[3];
+	uint64_t e = ctx->state.s64[4];
+	uint64_t f = ctx->state.s64[5];
+	uint64_t g = ctx->state.s64[6];
+	uint64_t h = ctx->state.s64[7];
+
+	uint64_t w0, w1, w2, w3, w4, w5, w6, w7;
+	uint64_t w8, w9, w10, w11, w12, w13, w14, w15;
+	uint64_t T1, T2;
+
+#if	defined(__sparc)
+	static const uint64_t sha512_consts[] = {
+		SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2,
+		SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5,
+		SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8,
+		SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11,
+		SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14,
+		SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17,
+		SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20,
+		SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23,
+		SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26,
+		SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29,
+		SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32,
+		SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35,
+		SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38,
+		SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41,
+		SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44,
+		SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47,
+		SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50,
+		SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53,
+		SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56,
+		SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59,
+		SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62,
+		SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65,
+		SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68,
+		SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71,
+		SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74,
+		SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77,
+		SHA512_CONST_78, SHA512_CONST_79
+	};
+#endif	/* __sparc */
+
+
+	if ((uintptr_t)blk & 0x7) {		/* not 8-byte aligned? */
+		bcopy(blk, ctx->buf_un.buf64,  sizeof (ctx->buf_un.buf64));
+		blk = (uint8_t *)ctx->buf_un.buf64;
+	}
+
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w0 =  LOAD_BIG_64(blk + 8 * 0);
+	SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w1 =  LOAD_BIG_64(blk + 8 * 1);
+	SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w2 =  LOAD_BIG_64(blk + 8 * 2);
+	SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w3 =  LOAD_BIG_64(blk + 8 * 3);
+	SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w4 =  LOAD_BIG_64(blk + 8 * 4);
+	SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w5 =  LOAD_BIG_64(blk + 8 * 5);
+	SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w6 =  LOAD_BIG_64(blk + 8 * 6);
+	SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w7 =  LOAD_BIG_64(blk + 8 * 7);
+	SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w8 =  LOAD_BIG_64(blk + 8 * 8);
+	SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w9 =  LOAD_BIG_64(blk + 8 * 9);
+	SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w10 =  LOAD_BIG_64(blk + 8 * 10);
+	SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w11 =  LOAD_BIG_64(blk + 8 * 11);
+	SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w12 =  LOAD_BIG_64(blk + 8 * 12);
+	SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w13 =  LOAD_BIG_64(blk + 8 * 13);
+	SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w14 =  LOAD_BIG_64(blk + 8 * 14);
+	SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w15 =  LOAD_BIG_64(blk + 8 * 15);
+	SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15);
+
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15);
+
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15);
+
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15);
+
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15);
+
+	ctx->state.s64[0] += a;
+	ctx->state.s64[1] += b;
+	ctx->state.s64[2] += c;
+	ctx->state.s64[3] += d;
+	ctx->state.s64[4] += e;
+	ctx->state.s64[5] += f;
+	ctx->state.s64[6] += g;
+	ctx->state.s64[7] += h;
+
+}
 #endif	/* !__amd64 */
 
 
@@ -311,14 +619,56 @@ Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input,
 {
 	size_t		i, j;
 
-	for (i = 0, j = 0; j < len; i++, j += 4) {
-		output[j]	= (input[i] >> 24) & 0xff;
-		output[j + 1]	= (input[i] >> 16) & 0xff;
-		output[j + 2]	= (input[i] >>  8) & 0xff;
-		output[j + 3]	= input[i] & 0xff;
+#if	defined(__sparc)
+	if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
+		for (i = 0, j = 0; j < len; i++, j += 4) {
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			*((uint32_t *)(output + j)) = input[i];
+		}
+	} else {
+#endif	/* little endian -- will work on big endian, but slowly */
+		for (i = 0, j = 0; j < len; i++, j += 4) {
+			output[j]	= (input[i] >> 24) & 0xff;
+			output[j + 1]	= (input[i] >> 16) & 0xff;
+			output[j + 2]	= (input[i] >>  8) & 0xff;
+			output[j + 3]	= input[i] & 0xff;
+		}
+#if	defined(__sparc)
 	}
+#endif
 }
 
+/*
+ * Serialize len bytes' worth of 64-bit words from input into output; the
+ * byte-wise path writes the most-significant byte of each word first.
+ */
+static void
+Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input,
+    size_t len)
+{
+	size_t		off, b;
+
+#if	defined(__sparc)
+	if (IS_P2ALIGNED(output, sizeof (uint64_t))) {
+		for (off = 0; off < len; off += sizeof (uint64_t)) {
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			*((uint64_t *)(output + off)) = input[off >> 3];
+		}
+	} else {
+#endif	/* generic byte-wise path, independent of host byte order */
+		for (off = 0; off < len; off += sizeof (uint64_t)) {
+			/* peel off one byte per pass, highest byte first */
+			for (b = 0; b < sizeof (uint64_t); b++) {
+				output[off + b] =
+				    (input[off >> 3] >> (56 - 8 * b)) & 0xff;
+			}
+		}
+#if	defined(__sparc)
+	}
+#endif
+}
+
+
 void
 SHA2Init(uint64_t mech, SHA2_CTX *ctx)
 {
@@ -336,22 +686,86 @@ SHA2Init(uint64_t mech, SHA2_CTX *ctx)
 		ctx->state.s32[6] = 0x1f83d9abU;
 		ctx->state.s32[7] = 0x5be0cd19U;
 		break;
+	case SHA384_MECH_INFO_TYPE:
+	case SHA384_HMAC_MECH_INFO_TYPE:
+	case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL;
+		ctx->state.s64[1] = 0x629a292a367cd507ULL;
+		ctx->state.s64[2] = 0x9159015a3070dd17ULL;
+		ctx->state.s64[3] = 0x152fecd8f70e5939ULL;
+		ctx->state.s64[4] = 0x67332667ffc00b31ULL;
+		ctx->state.s64[5] = 0x8eb44a8768581511ULL;
+		ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL;
+		ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL;
+		break;
+	case SHA512_MECH_INFO_TYPE:
+	case SHA512_HMAC_MECH_INFO_TYPE:
+	case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0x6a09e667f3bcc908ULL;
+		ctx->state.s64[1] = 0xbb67ae8584caa73bULL;
+		ctx->state.s64[2] = 0x3c6ef372fe94f82bULL;
+		ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL;
+		ctx->state.s64[4] = 0x510e527fade682d1ULL;
+		ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL;
+		ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL;
+		ctx->state.s64[7] = 0x5be0cd19137e2179ULL;
+		break;
+	case SHA512_224_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0x8C3D37C819544DA2ULL;
+		ctx->state.s64[1] = 0x73E1996689DCD4D6ULL;
+		ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL;
+		ctx->state.s64[3] = 0x679DD514582F9FCFULL;
+		ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL;
+		ctx->state.s64[5] = 0x77E36F7304C48942ULL;
+		ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL;
+		ctx->state.s64[7] = 0x1112E6AD91D692A1ULL;
+		break;
+	case SHA512_256_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0x22312194FC2BF72CULL;
+		ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL;
+		ctx->state.s64[2] = 0x2393B86B6F53B151ULL;
+		ctx->state.s64[3] = 0x963877195940EABDULL;
+		ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL;
+		ctx->state.s64[5] = 0xBE5E1E2553863992ULL;
+		ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL;
+		ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL;
+		break;
+#ifdef _KERNEL
 	default:
 		cmn_err(CE_PANIC,
 		    "sha2_init: failed to find a supported algorithm: 0x%x",
 		    (uint32_t)mech);
+
+#endif /* _KERNEL */
 	}
 
 	ctx->algotype = (uint32_t)mech;
 	ctx->count.c64[0] = ctx->count.c64[1] = 0;
 }
 
+#ifndef _KERNEL
+
+/* illumos "#pragma inline(SHA256Init, SHA384Init, SHA512Init)" removed */
 void
 SHA256Init(SHA256_CTX *ctx)
 {
 	SHA2Init(SHA256, ctx);
 }
 
+void
+SHA384Init(SHA384_CTX *ctx)
+{
+	SHA2Init(SHA384, ctx);
+}
+
+void
+SHA512Init(SHA512_CTX *ctx)
+{
+	SHA2Init(SHA512, ctx);
+}
+
+#endif /* !_KERNEL */
+
 /*
  * SHA2Update()
  *
@@ -422,6 +836,8 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
 			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
 			if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE)
 				SHA256Transform(ctx, ctx->buf_un.buf8);
+			else
+				SHA512Transform(ctx, ctx->buf_un.buf8);
 
 			i = buf_len;
 		}
@@ -431,6 +847,10 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
 			for (; i + buf_limit - 1 < input_len; i += buf_limit) {
 				SHA256Transform(ctx, &input[i]);
 			}
+		} else {
+			for (; i + buf_limit - 1 < input_len; i += buf_limit) {
+				SHA512Transform(ctx, &input[i]);
+			}
 		}
 
 #else
@@ -441,6 +861,13 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
 				    block_count);
 				i += block_count << 6;
 			}
+		} else {
+			block_count = (input_len - i) >> 7;
+			if (block_count > 0) {
+				SHA512TransformBlocks(ctx, &input[i],
+				    block_count);
+				i += block_count << 7;
+			}
 		}
 #endif	/* !__amd64 */
 
@@ -479,6 +906,7 @@ void
 SHA2Final(void *digest, SHA2_CTX *ctx)
 {
 	uint8_t		bitcount_be[sizeof (ctx->count.c32)];
+	uint8_t		bitcount_be64[sizeof (ctx->count.c64)];
 	uint32_t	index;
 	uint32_t	algotype = ctx->algotype;
 
@@ -488,8 +916,45 @@ SHA2Final(void *digest, SHA2_CTX *ctx)
 		SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
 		SHA2Update(ctx, bitcount_be, sizeof (bitcount_be));
 		Encode(digest, ctx->state.s32, sizeof (ctx->state.s32));
+	} else {
+		index  = (ctx->count.c64[1] >> 3) & 0x7f;
+		Encode64(bitcount_be64, ctx->count.c64,
+		    sizeof (bitcount_be64));
+		SHA2Update(ctx, PADDING, ((index < 112) ? 112 : 240) - index);
+		SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64));
+		if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) {
+			ctx->state.s64[6] = ctx->state.s64[7] = 0;
+			Encode64(digest, ctx->state.s64,
+			    sizeof (uint64_t) * 6);
+		} else if (algotype == SHA512_224_MECH_INFO_TYPE) {
+			uint8_t last[sizeof (uint64_t)];
+			/*
+			 * Since SHA-512/224 doesn't align well to 64-bit
+			 * boundaries, we must do the encoding in three steps:
+			 * 1) encode the three 64-bit words that fit neatly
+			 * 2) encode the last 64-bit word to a temp buffer
+			 * 3) append the first four bytes of the temp buffer (the
+			 *    upper 32 bits of that word) to the digest
+			 */
+			Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3);
+			Encode64(last, &ctx->state.s64[3], sizeof (uint64_t));
+			bcopy(last, (uint8_t *)digest + 24, 4);
+		} else if (algotype == SHA512_256_MECH_INFO_TYPE) {
+			Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4);
+		} else {
+			Encode64(digest, ctx->state.s64,
+			    sizeof (ctx->state.s64));
+		}
 	}
 
 	/* zeroize sensitive information */
 	bzero(ctx, sizeof (*ctx));
 }
+
+
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(SHA2Init);
+EXPORT_SYMBOL(SHA2Update);
+EXPORT_SYMBOL(SHA2Final);
+#endif
diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE b/module/icp/algs/skein/THIRDPARTYLICENSE
new file mode 100644
index 000000000..b7434fd17
--- /dev/null
+++ b/module/icp/algs/skein/THIRDPARTYLICENSE
@@ -0,0 +1,3 @@
+Implementation of the Skein hash function.
+Source code author: Doug Whiting, 2008.
+This algorithm and source code is released to the public domain.
diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE.descrip b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
new file mode 100644
index 000000000..0ae89cfdf
--- /dev/null
+++ b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+LICENSE TERMS OF SKEIN HASH ALGORITHM IMPLEMENTATION
diff --git a/module/icp/algs/skein/skein.c b/module/icp/algs/skein/skein.c
new file mode 100644
index 000000000..0981eee08
--- /dev/null
+++ b/module/icp/algs/skein/skein.c
@@ -0,0 +1,921 @@
+/*
+ * Implementation of the Skein hash function.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#define	SKEIN_PORT_CODE		/* instantiate any code in skein_port.h */
+
+#include <sys/types.h>
+#include <sys/note.h>
+#include <sys/skein.h>		/* get the Skein API definitions   */
+#include "skein_impl.h"		/* get internal definitions */
+
+/* External function to process blkCnt (nonzero) full block(s) of data. */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd);
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd);
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd);
+
+/* 256-bit Skein */
+/*
+ * Initialize ctx for plain hashing with a hashBitLen-bit digest: load a
+ * precomputed IV into ctx->X when one exists for hashBitLen, otherwise derive
+ * it from a config block. Returns SKEIN_SUCCESS once ctx is set up.
+ */
+int
+Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+{
+	union {
+		uint8_t b[SKEIN_256_STATE_BYTES];
+		uint64_t w[SKEIN_256_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+
+	switch (hashBitLen) {	/* use pre-computed values, where available */
+#ifndef	SKEIN_NO_PRECOMP
+	case 256:
+		bcopy(SKEIN_256_IV_256, ctx->X, sizeof (ctx->X));
+		break;
+	case 224:
+		bcopy(SKEIN_256_IV_224, ctx->X, sizeof (ctx->X));
+		break;
+	case 160:
+		bcopy(SKEIN_256_IV_160, ctx->X, sizeof (ctx->X));
+		break;
+	case 128:
+		bcopy(SKEIN_256_IV_128, ctx->X, sizeof (ctx->X));
+		break;
+#endif
+	default:
+		/* no precomputed IV value is available for this hashBitLen */
+		/* build/process the config block, type == CONFIG */
+		/* set tweaks: T0=0; T1=CFG | FINAL */
+		Skein_Start_New_Type(ctx, CFG_FINAL);
+
+		/* set the schema, version */
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+		/* hash result length in bits */
+		cfg.w[1] = Skein_Swap64(hashBitLen);
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		/* zero pad config block */
+		bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+		/* compute the initial chaining values from config block */
+		/* zero the chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
+	/*
+	 * ctx->X now holds the chaining vars for the given hashBitLen; set up
+	 * to process the data message portion of the hash (default).
+	 */
+	Skein_Start_New_Type(ctx, MSG);	/* T0=0, T1= MSG type */
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Init the context for a MAC and/or tree hash operation.
+ * [identical to Skein_256_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+    const uint8_t *key, size_t keyBytes)
+{
+	union {
+		uint8_t b[SKEIN_256_STATE_BYTES];
+		uint64_t w[SKEIN_256_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+	/* compute the initial chaining values ctx->X[], based on key */
+	if (keyBytes == 0) {	/* is there a key? */
+		/* no key: use all zeroes as key for config block */
+		bzero(ctx->X, sizeof (ctx->X));
+	} else {		/* here to pre-process a key */
+
+		Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+		/* do a mini-Init right here */
+		/* set output hash bit count = state size */
+		ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+		/* set tweaks: T0 = 0; T1 = KEY type */
+		Skein_Start_New_Type(ctx, KEY);
+		/* zero the initial chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		/* hash the key */
+		(void) Skein_256_Update(ctx, key, keyBytes);
+		/* put result into cfg.b[] */
+		(void) Skein_256_Final_Pad(ctx, cfg.b);
+		/* copy over into ctx->X[], never more than ctx->X can hold */
+		bcopy(cfg.b, ctx->X, sizeof (ctx->X));
+#if	SKEIN_NEED_SWAP
+		{
+			uint_t i;
+			/* convert key bytes to context words */
+			for (i = 0; i < SKEIN_256_STATE_WORDS; i++)
+				ctx->X[i] = Skein_Swap64(ctx->X[i]);
+		}
+#endif
+	}
+	/*
+	 * build/process the config block, type == CONFIG (could be
+	 * precomputed for each key)
+	 */
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+	Skein_Start_New_Type(ctx, CFG_FINAL);
+
+	bzero(&cfg.w, sizeof (cfg.w));	/* pre-pad cfg.w[] with zeroes */
+	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+	cfg.w[1] = Skein_Swap64(hashBitLen);	/* hash result length in bits */
+	/* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+	cfg.w[2] = Skein_Swap64(treeInfo);
+
+	Skein_Show_Key(256, &ctx->h, key, keyBytes);
+
+	/* compute the initial chaining values from config block */
+	Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+	/* The chaining vars ctx->X are now initialized */
+	/* Set up to process the data message portion of the hash (default) */
+	ctx->h.bCnt = 0;	/* buffer b[] starts out empty */
+	Skein_Start_New_Type(ctx, MSG);
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Absorb msgByteCnt bytes of msg into ctx. Full blocks are processed as they
+ * become available; trailing bytes (up to one full block) stay in ctx->b.
+ */
+int
+Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+	size_t n;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* process full blocks, if any */
+	if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) {
+		/* finish up any buffered message data */
+		if (ctx->h.bCnt) {
+			/* # bytes free in buffer b[] */
+			n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;
+			if (n) {
+				/* check on our logic here */
+				Skein_assert(n < msgByteCnt);
+				bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+				msgByteCnt -= n;
+				msg += n;
+				ctx->h.bCnt += n;
+			}
+			Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+			Skein_256_Process_Block(ctx, ctx->b, 1,
+			    SKEIN_256_BLOCK_BYTES);
+			ctx->h.bCnt = 0;
+		}
+		/* now process remaining full blocks directly from input data */
+		if (msgByteCnt > SKEIN_256_BLOCK_BYTES) {
+			/* # of full blocks, keeping >= 1 byte back */
+			n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES;
+			Skein_256_Process_Block(ctx, msg, n,
+			    SKEIN_256_BLOCK_BYTES);
+			msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
+			msg += n * SKEIN_256_BLOCK_BYTES;
+		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
+
+	/* copy any remaining source message data bytes into b[] */
+	if (msgByteCnt) {
+		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+		bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+		ctx->h.bCnt += msgByteCnt;
+	}
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Finalize the hash: process the buffered final block, then write
+ * (hashBitLen + 7) / 8 output bytes to hashVal. Returns SKEIN_SUCCESS.
+ */
+int
+Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t i, n, byteCnt;
+	uint64_t X[SKEIN_256_STATE_WORDS];
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;	/* tag as the final block */
+	/* zero pad b[] if necessary */
+	if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)
+		bzero(&ctx->b[ctx->h.bCnt],
+		    SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+
+	/* process the final block */
+	Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+	/* now output the result: total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+	/* run Threefish in "counter mode"; zero b[] to hold the counter */
+	bzero(ctx->b, sizeof (ctx->b));
+	/* keep a local copy of counter mode "key" */
+	bcopy(ctx->X, X, sizeof (X));
+	for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) {
+		/* build the counter block, then run "counter mode" on it */
+		uint64_t tmp = Skein_Swap64((uint64_t)i);
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* number of output bytes left to go */
+		n = byteCnt - i * SKEIN_256_BLOCK_BYTES;
+		if (n >= SKEIN_256_BLOCK_BYTES)
+			n = SKEIN_256_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES,
+		    ctx->X, n);	/* "output" the ctr mode bytes */
+		Skein_Show_Final(256, &ctx->h, n,
+		    hashVal + i * SKEIN_256_BLOCK_BYTES);
+		/* restore the counter mode key for next time */
+		bcopy(X, ctx->X, sizeof (X));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* 512-bit Skein */
+
+/*
+ * Initialize ctx for plain hashing with a hashBitLen-bit digest: load a
+ * precomputed IV into ctx->X when one exists for hashBitLen, otherwise derive
+ * it from a config block. Returns SKEIN_SUCCESS once ctx is set up.
+ */
+int
+Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+{
+	union {
+		uint8_t b[SKEIN_512_STATE_BYTES];
+		uint64_t w[SKEIN_512_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+
+	switch (hashBitLen) {	/* use pre-computed values, where available */
+#ifndef	SKEIN_NO_PRECOMP
+	case 512:
+		bcopy(SKEIN_512_IV_512, ctx->X, sizeof (ctx->X));
+		break;
+	case 384:
+		bcopy(SKEIN_512_IV_384, ctx->X, sizeof (ctx->X));
+		break;
+	case 256:
+		bcopy(SKEIN_512_IV_256, ctx->X, sizeof (ctx->X));
+		break;
+	case 224:
+		bcopy(SKEIN_512_IV_224, ctx->X, sizeof (ctx->X));
+		break;
+#endif
+	default:
+		/* no precomputed IV value is available for this hashBitLen */
+		/* build/process the config block, type == CONFIG */
+		/* set tweaks: T0=0; T1=CFG | FINAL */
+		Skein_Start_New_Type(ctx, CFG_FINAL);
+
+		/* set the schema, version */
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+		/* hash result length in bits */
+		cfg.w[1] = Skein_Swap64(hashBitLen);
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		/* zero pad config block */
+		bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+		/* compute the initial chaining values from config block */
+		/* zero the chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
+
+	/*
+	 * ctx->X now holds the chaining vars for the given hashBitLen; set up
+	 * to process the data message portion of the hash (default).
+	 */
+	Skein_Start_New_Type(ctx, MSG);	/* T0=0, T1= MSG type */
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Init ctx for a Skein-512 MAC and/or tree hash: optional key (keyBytes
+ * long) plus treeInfo. [identical to Skein_512_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+    const uint8_t *key, size_t keyBytes)
+{
+	union {
+		uint8_t b[SKEIN_512_STATE_BYTES];
+		uint64_t w[SKEIN_512_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+	/* compute the initial chaining values ctx->X[], based on key */
+	if (keyBytes == 0) {	/* is there a key? */
+		/* no key: use all zeroes as key for config block */
+		bzero(ctx->X, sizeof (ctx->X));
+	} else {		/* here to pre-process a key */
+		/* cfg.b[] must be able to supply every byte of ctx->X[] */
+		Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+		/* do a mini-Init right here */
+		/* set output hash bit count = state size */
+		ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+		/* set tweaks: T0 = 0; T1 = KEY type */
+		Skein_Start_New_Type(ctx, KEY);
+		/* zero the initial chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		(void) Skein_512_Update(ctx, key, keyBytes); /* hash the key */
+		/* put result into cfg.b[] */
+		(void) Skein_512_Final_Pad(ctx, cfg.b);
+		/* copy into ctx->X[], bounded by the destination's size */
+		bcopy(cfg.b, ctx->X, sizeof (ctx->X));
+#if	SKEIN_NEED_SWAP
+		{
+			uint_t i;
+			/* convert key bytes to context words */
+			for (i = 0; i < SKEIN_512_STATE_WORDS; i++)
+				ctx->X[i] = Skein_Swap64(ctx->X[i]);
+		}
+#endif
+	}
+	/*
+	 * build/process the config block, type == CONFIG (could be
+	 * precomputed for each key)
+	 */
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+	Skein_Start_New_Type(ctx, CFG_FINAL);
+
+	bzero(&cfg.w, sizeof (cfg.w));	/* pre-pad cfg.w[] with zeroes */
+	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+	cfg.w[1] = Skein_Swap64(hashBitLen);	/* hash result length in bits */
+	/* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+	cfg.w[2] = Skein_Swap64(treeInfo);
+
+	Skein_Show_Key(512, &ctx->h, key, keyBytes);
+
+	/* compute the initial chaining values from config block */
+	Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+	/* The chaining vars ctx->X are now initialized */
+	/* Set up to process the data message portion of the hash (default) */
+	ctx->h.bCnt = 0;	/* buffer b[] starts out empty */
+	Skein_Start_New_Type(ctx, MSG);
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Absorb msgByteCnt bytes of msg into the Skein-512 hash state in ctx */
+int
+Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+	/*
+	 * Blocks are only processed once more input is known to follow,
+	 * so the last (possibly full) block always stays buffered in b[].
+	 */
+	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) {
+		/* top up and flush a partially filled buffer first */
+		if (ctx->h.bCnt != 0) {
+			size_t room = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+
+			if (room != 0) {
+				/* the outer test guarantees enough input */
+				Skein_assert(room < msgByteCnt);
+				bcopy(msg, ctx->b + ctx->h.bCnt, room);
+				msg += room;
+				msgByteCnt -= room;
+				ctx->h.bCnt += room;
+			}
+			Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+			Skein_512_Process_Block(ctx, ctx->b, 1,
+			    SKEIN_512_BLOCK_BYTES);
+			ctx->h.bCnt = 0;
+		}
+
+		/* hash whole blocks in place, holding back the final one */
+		if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
+			size_t nblk = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
+			size_t done = nblk * SKEIN_512_BLOCK_BYTES;
+
+			Skein_512_Process_Block(ctx, msg, nblk,
+			    SKEIN_512_BLOCK_BYTES);
+			msg += done;
+			msgByteCnt -= done;
+		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
+
+	/* whatever is left (at most one block) waits in b[] */
+	if (msgByteCnt != 0) {
+		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+		bcopy(msg, ctx->b + ctx->h.bCnt, msgByteCnt);
+		ctx->h.bCnt += msgByteCnt;
+	}
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Pad and process the last block, then write the hashBitLen-bit digest */
+int
+Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	uint64_t savedX[SKEIN_512_STATE_WORDS];	/* saved output-stage "key" */
+	size_t outBytes, off, left, ctr;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* the buffered data forms the final block; zero-fill its tail */
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+		bzero(ctx->b + ctx->h.bCnt,
+		    SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+	Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+	/* digest size in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * Output stage: Threefish in "counter mode". b[] is cleared to hold
+	 * the counter and the chaining values are saved so that every
+	 * output block starts from the same state.
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN_512_BLOCK_BYTES) {
+		/* counter value goes into the start of b[] */
+		uint64_t tmp = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+
+		/* emit at most one block's worth of the remaining bytes */
+		left = outBytes - off;
+		if (left > SKEIN_512_BLOCK_BYTES)
+			left = SKEIN_512_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, left);
+		Skein_Show_Final(512, &ctx->h, left, hashVal + off);
+
+		/* put the saved chaining values back for the next pass */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* 1024-bit Skein */
+
+/*
+ * Set up ctx for plain sequential Skein-1024 hashing, hashBitLen output bits.
+ */
+int
+Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+{
+	union {
+		uint8_t b[SKEIN1024_STATE_BYTES];
+		uint64_t w[SKEIN1024_STATE_WORDS];
+	} config;		/* configuration block */
+	const void *iv = NULL;	/* precomputed chaining values, if any */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen;	/* requested digest size in bits */
+
+#ifndef	SKEIN_NO_PRECOMP
+	/* is there a precomputed IV for the requested digest size? */
+	if (hashBitLen == 512)
+		iv = SKEIN1024_IV_512;
+	else if (hashBitLen == 384)
+		iv = SKEIN1024_IV_384;
+	else if (hashBitLen == 1024)
+		iv = SKEIN1024_IV_1024;
+#endif
+
+	if (iv != NULL) {
+		/* load the precomputed values straight into the state */
+		bcopy(iv, ctx->X, sizeof (ctx->X));
+	} else {
+		/*
+		 * No precomputed IV for this size: derive the chaining
+		 * values by processing a CONFIG-type block.
+		 */
+		/* tweak: T0 = 0, T1 = CFG | FINAL */
+		Skein_Start_New_Type(ctx, CFG_FINAL);
+		/* start from an all-zero block, then fill in the fields */
+		bzero(&config, sizeof (config));
+		/* schema identifier and version */
+		config.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+		/* digest length in bits */
+		config.w[1] = Skein_Swap64(hashBitLen);
+		/* tree info: sequential hashing */
+		config.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+
+		/* the chaining variables start at zero for this step */
+		bzero(ctx->X, sizeof (ctx->X));
+		Skein1024_Process_Block(ctx, config.b, 1, SKEIN_CFG_STR_LEN);
+	}
+
+	/*
+	 * ctx->X now holds the initial chaining values for hashBitLen;
+	 * switch the tweak to MSG type so message data can be absorbed.
+	 */
+	Skein_Start_New_Type(ctx, MSG);	/* T0 = 0, T1 = MSG type */
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Init ctx for a Skein-1024 MAC and/or tree hash: optional key (keyBytes
+ * long) plus treeInfo. [identical to Skein1024_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+    const uint8_t *key, size_t keyBytes)
+{
+	union {
+		uint8_t b[SKEIN1024_STATE_BYTES];
+		uint64_t w[SKEIN1024_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+	/* compute the initial chaining values ctx->X[], based on key */
+	if (keyBytes == 0) {	/* is there a key? */
+		/* no key: use all zeroes as key for config block */
+		bzero(ctx->X, sizeof (ctx->X));
+	} else {		/* here to pre-process a key */
+		Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+		/* do a mini-Init right here */
+		/* set output hash bit count = state size */
+		ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+		/* set tweaks: T0 = 0; T1 = KEY type */
+		Skein_Start_New_Type(ctx, KEY);
+		/* zero the initial chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		(void) Skein1024_Update(ctx, key, keyBytes); /* hash the key */
+		/* put result into cfg.b[] */
+		(void) Skein1024_Final_Pad(ctx, cfg.b);
+		/* copy into ctx->X[], bounded by the destination's size */
+		bcopy(cfg.b, ctx->X, sizeof (ctx->X));
+#if	SKEIN_NEED_SWAP
+		{
+			uint_t i;
+			/* convert key bytes to context words */
+			for (i = 0; i < SKEIN1024_STATE_WORDS; i++)
+				ctx->X[i] = Skein_Swap64(ctx->X[i]);
+		}
+#endif
+	}
+	/*
+	 * build/process the config block, type == CONFIG (could be
+	 * precomputed for each key)
+	 */
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+	Skein_Start_New_Type(ctx, CFG_FINAL);
+
+	bzero(&cfg.w, sizeof (cfg.w));	/* pre-pad cfg.w[] with zeroes */
+	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+	/* hash result length in bits */
+	cfg.w[1] = Skein_Swap64(hashBitLen);
+	/* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+	cfg.w[2] = Skein_Swap64(treeInfo);
+
+	Skein_Show_Key(1024, &ctx->h, key, keyBytes);
+
+	/* compute the initial chaining values from config block */
+	Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+	/* The chaining vars ctx->X are now initialized */
+	/* Set up to process the data message portion of the hash (default) */
+	ctx->h.bCnt = 0;	/* buffer b[] starts out empty */
+	Skein_Start_New_Type(ctx, MSG);
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Absorb msgByteCnt bytes of msg into the Skein-1024 hash state in ctx */
+int
+Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/*
+	 * Blocks are only processed once more input is known to follow,
+	 * so the last (possibly full) block always stays buffered in b[].
+	 */
+	if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) {
+		/* top up and flush a partially filled buffer first */
+		if (ctx->h.bCnt != 0) {
+			size_t room = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;
+
+			if (room != 0) {
+				/* the outer test guarantees enough input */
+				Skein_assert(room < msgByteCnt);
+				bcopy(msg, ctx->b + ctx->h.bCnt, room);
+				msg += room;
+				msgByteCnt -= room;
+				ctx->h.bCnt += room;
+			}
+			Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+			Skein1024_Process_Block(ctx, ctx->b, 1,
+			    SKEIN1024_BLOCK_BYTES);
+			ctx->h.bCnt = 0;
+		}
+
+		/* hash whole blocks in place, holding back the final one */
+		if (msgByteCnt > SKEIN1024_BLOCK_BYTES) {
+			size_t nblk = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES;
+			size_t done = nblk * SKEIN1024_BLOCK_BYTES;
+
+			Skein1024_Process_Block(ctx, msg, nblk,
+			    SKEIN1024_BLOCK_BYTES);
+			msg += done;
+			msgByteCnt -= done;
+		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
+
+	/* whatever is left (at most one block) waits in b[] */
+	if (msgByteCnt != 0) {
+		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+		bcopy(msg, ctx->b + ctx->h.bCnt, msgByteCnt);
+		ctx->h.bCnt += msgByteCnt;
+	}
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Pad and process the last block, then write the hashBitLen-bit digest */
+int
+Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	uint64_t savedX[SKEIN1024_STATE_WORDS];	/* saved output-stage "key" */
+	size_t outBytes, off, left, ctr;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* the buffered data forms the final block; zero-fill its tail */
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+	if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)
+		bzero(ctx->b + ctx->h.bCnt,
+		    SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+	Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+	/* digest size in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * Output stage: Threefish in "counter mode". b[] is cleared to hold
+	 * the counter and the chaining values are saved so that every
+	 * output block starts from the same state.
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN1024_BLOCK_BYTES) {
+		/* counter value goes into the start of b[] */
+		uint64_t tmp = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+
+		/* emit at most one block's worth of the remaining bytes */
+		left = outBytes - off;
+		if (left > SKEIN1024_BLOCK_BYTES)
+			left = SKEIN1024_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, left);
+		Skein_Show_Final(1024, &ctx->h, left, hashVal + off);
+
+		/* put the saved chaining values back for the next pass */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* Functions to support MAC/tree hashing */
+/* (this code is identical for Optimized and Reference versions) */
+
+/* Pad and process the final block, then copy out the raw state (no OUTPUT) */
+int
+Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t used;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+	used = ctx->h.bCnt;	/* bytes buffered in b[] */
+
+	/* mark the block as final and zero-fill the unused tail of b[] */
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+	if (used < SKEIN_256_BLOCK_BYTES)
+		bzero(ctx->b + used, SKEIN_256_BLOCK_BYTES - used);
+	Skein_256_Process_Block(ctx, ctx->b, 1, used);
+
+	/* write out one block's worth of chaining-state bytes */
+	Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_256_BLOCK_BYTES);
+	return (SKEIN_SUCCESS);
+}
+
+/* Pad and process the final block, then copy out the raw state (no OUTPUT) */
+int
+Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t used;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+	used = ctx->h.bCnt;	/* bytes buffered in b[] */
+
+	/* mark the block as final and zero-fill the unused tail of b[] */
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+	if (used < SKEIN_512_BLOCK_BYTES)
+		bzero(ctx->b + used, SKEIN_512_BLOCK_BYTES - used);
+	Skein_512_Process_Block(ctx, ctx->b, 1, used);
+
+	/* write out one block's worth of chaining-state bytes */
+	Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_512_BLOCK_BYTES);
+	return (SKEIN_SUCCESS);
+}
+
+/* Pad and process the final block, then copy out the raw state (no OUTPUT) */
+int
+Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t used;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+	used = ctx->h.bCnt;	/* bytes buffered in b[] */
+
+	/* mark the block as final and zero-fill the unused tail of b[] */
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+	if (used < SKEIN1024_BLOCK_BYTES)
+		bzero(ctx->b + used, SKEIN1024_BLOCK_BYTES - used);
+	Skein1024_Process_Block(ctx, ctx->b, 1, used);
+
+	/* write out one block's worth of chaining-state bytes */
+	Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN1024_BLOCK_BYTES);
+
+	return (SKEIN_SUCCESS);
+}
+
+#if	SKEIN_TREE_HASH
+/* Run only the OUTPUT stage: write the hashBitLen-bit digest to hashVal */
+int
+Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	uint64_t savedX[SKEIN_256_STATE_WORDS];	/* output-stage "key" */
+	size_t outBytes, off, left, ctr;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* digest size in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * Threefish in "counter mode": b[] is cleared to hold the counter
+	 * and the chaining values are saved so that every output block
+	 * starts from the same state.
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN_256_BLOCK_BYTES) {
+		/* counter value goes into the start of b[] */
+		uint64_t tmp = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* emit at most one block's worth of the remaining bytes */
+		left = outBytes - off;
+		if (left > SKEIN_256_BLOCK_BYTES)
+			left = SKEIN_256_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, left);
+		Skein_Show_Final(256, &ctx->h, left, hashVal + off);
+		/* put the saved chaining values back for the next pass */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* just do the OUTPUT stage: emit the digest from the current state in ctx */
+int
+Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t i, n, byteCnt;
+	uint64_t X[SKEIN_512_STATE_WORDS];
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* total number of output bytes, rounding a partial byte up */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+	/* run Threefish in "counter mode" to generate output */
+	/* zero out b[], so it can hold the counter */
+	bzero(ctx->b, sizeof (ctx->b));
+	/* keep a local copy of counter mode "key" */
+	bcopy(ctx->X, X, sizeof (X));
+	for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) {
+		/* build the counter block */
+		uint64_t tmp = Skein_Swap64((uint64_t)i);
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		/* run "counter mode" */
+		Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* number of output bytes left to go */
+		n = byteCnt - i * SKEIN_512_BLOCK_BYTES;
+		if (n >= SKEIN_512_BLOCK_BYTES)
+			n = SKEIN_512_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES,
+		    ctx->X, n);	/* "output" the ctr mode bytes */
+		/* report under this variant's block size (512, not 256) */
+		Skein_Show_Final(512, &ctx->h, n,
+		    hashVal + i * SKEIN_512_BLOCK_BYTES);
+		/* restore the counter mode key for next time */
+		bcopy(X, ctx->X, sizeof (X));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* just do the OUTPUT stage: emit the digest from the current state in ctx */
+int
+Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t i, n, byteCnt;
+	uint64_t X[SKEIN1024_STATE_WORDS];
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* total number of output bytes, rounding a partial byte up */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+	/* run Threefish in "counter mode" to generate output */
+	/* zero out b[], so it can hold the counter */
+	bzero(ctx->b, sizeof (ctx->b));
+	/* keep a local copy of counter mode "key" */
+	bcopy(ctx->X, X, sizeof (X));
+	for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) {
+		/* build the counter block */
+		uint64_t tmp = Skein_Swap64((uint64_t)i);
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		/* run "counter mode" */
+		Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* number of output bytes left to go */
+		n = byteCnt - i * SKEIN1024_BLOCK_BYTES;
+		if (n >= SKEIN1024_BLOCK_BYTES)
+			n = SKEIN1024_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES,
+		    ctx->X, n);	/* "output" the ctr mode bytes */
+		/* report under this variant's block size (1024, not 256) */
+		Skein_Show_Final(1024, &ctx->h, n,
+		    hashVal + i * SKEIN1024_BLOCK_BYTES);
+		/* restore the counter mode key for next time */
+		bcopy(X, ctx->X, sizeof (X));
+	}
+	return (SKEIN_SUCCESS);
+}
+#endif
+
+#ifdef _KERNEL	/* kernel builds: export the Skein-512 hashing entry points */
+EXPORT_SYMBOL(Skein_512_Init);
+EXPORT_SYMBOL(Skein_512_InitExt);
+EXPORT_SYMBOL(Skein_512_Update);
+EXPORT_SYMBOL(Skein_512_Final);
+#endif	/* _KERNEL */
diff --git a/module/icp/algs/skein/skein_block.c b/module/icp/algs/skein/skein_block.c
new file mode 100644
index 000000000..d2e811963
--- /dev/null
+++ b/module/icp/algs/skein/skein_block.c
@@ -0,0 +1,793 @@
+/*
+ * Implementation of the Skein block functions.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ * Compile-time switches:
+ *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
+ *                    versions use ASM code for block processing
+ *                    [default: use C for all block sizes]
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#include <sys/skein.h>
+#include "skein_impl.h"
+#include <sys/isa_defs.h>	/* for _ILP32 */
+
+#ifndef	SKEIN_USE_ASM
+#define	SKEIN_USE_ASM	(0)	/* default is all C code (no ASM) */
+#endif
+
+#ifndef	SKEIN_LOOP
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stack frames are enforced (like 32-bit kernel builds), do not unroll
+ * checksum calculations to save stack space.
+ *
+ * Even with no loops unrolled, we can still exceed the 1k stack frame limit
+ * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32).  We can
+ * safely ignore it though, since the checksum functions will be called
+ * from a worker thread that won't be using much stack.  That's why we have
+ * the #pragma here to ignore the warning.
+ */
+#if defined(_ILP32) || defined(__powerpc)	/* Assume small stack */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+/*
+ * Small-stack build (32-bit or PowerPC): don't unroll loops, to save stack
+ * frame space.
+ * Due to the way SKEIN_LOOP is decoded into per-size unroll counts in
+ * Skein_*_Process_Block(), a value of 111 disables unrolling loops
+ * in any of those functions.
+ */
+#define	SKEIN_LOOP 111
+#else
+/* We're compiling with large stacks */
+#define	SKEIN_LOOP 001		/* default: unroll 256 and 512, but not 1024 */
+#endif	/* _ILP32 || __powerpc */
+#endif	/* SKEIN_LOOP */
+
+/* helpers for the block functions; each one defines its own WCNT and kw[] */
+#define	BLK_BITS	(WCNT*64)	/* WCNT 64-bit words, in bits */
+#define	KW_TWK_BASE	(0)		/* tweak words start at kw[0] */
+#define	KW_KEY_BASE	(3)		/* key words start at kw[3] */
+#define	ks		(kw + KW_KEY_BASE)	/* key schedule part of kw[] */
+#define	ts		(kw + KW_TWK_BASE)	/* tweak part of kw[] */
+
+/* no debugging in Illumos version */
+#define	DebugSaveTweak(ctx)	/* expands to nothing */
+
+/* Skein_256 */
+#if	!(SKEIN_USE_ASM & 256)
+/* Absorb blkCnt blocks from blkPtr into ctx; T0 advances byteCntAdd each */
+void
+Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd)
+{				/* do it in C */
+	enum {
+		WCNT = SKEIN_256_STATE_WORDS
+	};
+#undef  RCNT
+#define	RCNT  (SKEIN_256_ROUNDS_TOTAL / 8)
+
+#ifdef	SKEIN_LOOP		/* hundreds digit sets the unroll count */
+#define	SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
+#else
+#define	SKEIN_UNROLL_256 (0)
+#endif
+
+#if	SKEIN_UNROLL_256
+#if	(RCNT % SKEIN_UNROLL_256)
+#error "Invalid SKEIN_UNROLL_256"	/* sanity check on unroll count */
+#endif
+	size_t r;
+	/* key schedule words : chaining vars + tweak + "rotation" */
+	uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
+#endif
+	/* local copy of context vars, for speed */
+	uint64_t X0, X1, X2, X3;
+	uint64_t w[WCNT];		/* local copy of input block */
+#ifdef	SKEIN_DEBUG
+	/* use for debugging (help compiler put Xn in registers) */
+	const uint64_t *Xptr[4];
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
+#endif
+	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
+	ts[0] = ctx->h.T[0];
+	ts[1] = ctx->h.T[1];
+	do {
+		/*
+		 * this implementation only supports 2**64 input bytes
+		 * (no carry out here)
+		 */
+		ts[0] += byteCntAdd;	/* update processed length */
+
+		/* precompute the key schedule for this block */
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
+
+		ts[2] = ts[0] ^ ts[1];
+
+		/* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT);
+		DebugSaveTweak(ctx);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X0 = w[0] + ks[0];	/* do the first full key injection */
+		X1 = w[1] + ks[1] + ts[0];
+		X2 = w[2] + ks[2] + ts[1];
+		X3 = w[3] + ks[3];
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+		    Xptr);	/* show starting state values */
+
+		blkPtr += SKEIN_256_BLOCK_BYTES;
+
+		/* run the rounds */
+
+#define	Round256(p0, p1, p2, p3, ROT, rNum)                          \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
+
+#if	SKEIN_UNROLL_256 == 0
+#define	R256(p0, p1, p2, p3, ROT, rNum)		/* fully unrolled */	\
+    Round256(p0, p1, p2, p3, ROT, rNum)					\
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define	I256(R)								\
+    X0 += ks[((R) + 1) % 5];	/* inject the key schedule value */	\
+    X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];			\
+    X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];			\
+    X3 += ks[((R) + 4) % 5] + (R) + 1;					\
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else				/* looping version */
+#define	R256(p0, p1, p2, p3, ROT, rNum)                             \
+    Round256(p0, p1, p2, p3, ROT, rNum)                             \
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define	I256(R)								\
+	X0 += ks[r + (R) + 0];	/* inject the key schedule value */	\
+	X1 += ks[r + (R) + 1] + ts[r + (R) + 0];			\
+	X2 += ks[r + (R) + 2] + ts[r + (R) + 1];			\
+	X3 += ks[r + (R) + 3] + r + (R);				\
+	ks[r + (R) + 4] = ks[r + (R) - 1];   /* rotate key schedule */	\
+    ts[r + (R) + 2] = ts[r + (R) - 1];					\
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		/* loop through the rounds */
+		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
+#endif
+		{
+#define	R256_8_rounds(R)                         \
+	R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);  \
+	R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);  \
+	R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);  \
+	R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);  \
+	I256(2 * (R));                           \
+	R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);  \
+	R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);  \
+	R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);  \
+	R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);  \
+	I256(2 * (R) + 1);
+
+			R256_8_rounds(0);
+
+#define	R256_Unroll_R(NN) \
+	((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
+	(SKEIN_UNROLL_256 > (NN)))
+
+#if	R256_Unroll_R(1)
+			R256_8_rounds(1);
+#endif
+#if	R256_Unroll_R(2)
+			R256_8_rounds(2);
+#endif
+#if	R256_Unroll_R(3)
+			R256_8_rounds(3);
+#endif
+#if	R256_Unroll_R(4)
+			R256_8_rounds(4);
+#endif
+#if	R256_Unroll_R(5)
+			R256_8_rounds(5);
+#endif
+#if	R256_Unroll_R(6)
+			R256_8_rounds(6);
+#endif
+#if	R256_Unroll_R(7)
+			R256_8_rounds(7);
+#endif
+#if	R256_Unroll_R(8)
+			R256_8_rounds(8);
+#endif
+#if	R256_Unroll_R(9)
+			R256_8_rounds(9);
+#endif
+#if	R256_Unroll_R(10)
+			R256_8_rounds(10);
+#endif
+#if	R256_Unroll_R(11)
+			R256_8_rounds(11);
+#endif
+#if	R256_Unroll_R(12)
+			R256_8_rounds(12);
+#endif
+#if	R256_Unroll_R(13)
+			R256_8_rounds(13);
+#endif
+#if	R256_Unroll_R(14)
+			R256_8_rounds(14);
+#endif
+#if	(SKEIN_UNROLL_256 > 14)
+#error  "need more unrolling in Skein_256_Process_Block"
+#endif
+		}
+		/*
+		 * do the final "feedforward" xor, update context chaining vars
+		 */
+		ctx->X[0] = X0 ^ w[0];
+		ctx->X[1] = X1 ^ w[1];
+		ctx->X[2] = X2 ^ w[2];
+		ctx->X[3] = X3 ^ w[3];
+
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+		ts[1] &= ~SKEIN_T1_FLAG_FIRST;	/* FIRST flag: 1st block only */
+	}
+	while (--blkCnt);
+	ctx->h.T[0] = ts[0];	/* save the updated tweak in the context */
+	ctx->h.T[1] = ts[1];
+}
+
+#if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein_256_Process_Block_CodeSize(void)
+{	/* byte distance from Skein_256_Process_Block() to this function */
+	return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
+	    ((uint8_t *)Skein_256_Process_Block);
+}
+
+uint_t
+Skein_256_Unroll_Cnt(void)
+{	/* unroll setting Skein_256_Process_Block() was built with */
+	return (SKEIN_UNROLL_256);
+}
+#endif	/* SKEIN_CODE_SIZE || SKEIN_PERF */
+#endif	/* !(SKEIN_USE_ASM & 256) */
+
+/* Skein_512 */
+#if	!(SKEIN_USE_ASM & 512)
+void
+Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd)
+{				/* do it in C */
+	enum {
+		WCNT = SKEIN_512_STATE_WORDS
+	};
+#undef  RCNT
+#define	RCNT  (SKEIN_512_ROUNDS_TOTAL / 8)
+
+#ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
+#define	SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
+#else
+#define	SKEIN_UNROLL_512 (0)
+#endif
+
+#if	SKEIN_UNROLL_512
+#if	(RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512"	/* sanity check on unroll count */
+#endif
+	size_t r;
+	/* key schedule words : chaining vars + tweak + "rotation" */
+	uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
+#endif
+	/* local copy of vars, for speed */
+	uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
+	uint64_t w[WCNT];		/* local copy of input block */
+#ifdef	SKEIN_DEBUG
+	/* use for debugging (help compiler put Xn in registers) */
+	const uint64_t *Xptr[8];
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
+	Xptr[4] = &X4;
+	Xptr[5] = &X5;
+	Xptr[6] = &X6;
+	Xptr[7] = &X7;
+#endif
+
+	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
+	ts[0] = ctx->h.T[0];
+	ts[1] = ctx->h.T[1];
+	do {
+		/*
+		 * this implementation only supports 2**64 input bytes
+		 * (no carry out here)
+		 */
+		ts[0] += byteCntAdd;	/* update processed length */
+
+		/* precompute the key schedule for this block */
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ctx->X[4];
+		ks[5] = ctx->X[5];
+		ks[6] = ctx->X[6];
+		ks[7] = ctx->X[7];
+		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+		ts[2] = ts[0] ^ ts[1];
+
+		/* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT);
+		DebugSaveTweak(ctx);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X0 = w[0] + ks[0];	/* do the first full key injection */
+		X1 = w[1] + ks[1];
+		X2 = w[2] + ks[2];
+		X3 = w[3] + ks[3];
+		X4 = w[4] + ks[4];
+		X5 = w[5] + ks[5] + ts[0];
+		X6 = w[6] + ks[6] + ts[1];
+		X7 = w[7] + ks[7];
+
+		blkPtr += SKEIN_512_BLOCK_BYTES;
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+		    Xptr);
+		/* run the rounds */
+#define	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
+	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
+
+#if	SKEIN_UNROLL_512 == 0
+#define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)	/* unrolled */	\
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define	I512(R)								\
+	X0 += ks[((R) + 1) % 9];	/* inject the key schedule value */\
+	X1 += ks[((R) + 2) % 9];					\
+	X2 += ks[((R) + 3) % 9];					\
+	X3 += ks[((R) + 4) % 9];					\
+	X4 += ks[((R) + 5) % 9];					\
+	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];			\
+	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];			\
+	X7 += ks[((R) + 8) % 9] + (R) + 1;				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else				/* looping version */
+#define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)			\
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define	I512(R)								\
+	X0 += ks[r + (R) + 0];	/* inject the key schedule value */	\
+	X1 += ks[r + (R) + 1];						\
+	X2 += ks[r + (R) + 2];						\
+	X3 += ks[r + (R) + 3];						\
+	X4 += ks[r + (R) + 4];						\
+	X5 += ks[r + (R) + 5] + ts[r + (R) + 0];			\
+	X6 += ks[r + (R) + 6] + ts[r + (R) + 1];			\
+	X7 += ks[r + (R) + 7] + r + (R);				\
+	ks[r + (R)+8] = ks[r + (R) - 1];	/* rotate key schedule */\
+	ts[r + (R)+2] = ts[r + (R) - 1];				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		/* loop thru it */
+		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
+#endif				/* end of looped code definitions */
+		{
+#define	R512_8_rounds(R)	/* do 8 full rounds */			\
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);		\
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);		\
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);		\
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);		\
+	I512(2 * (R));							\
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);		\
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);		\
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);		\
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);		\
+	I512(2*(R) + 1);		/* and key injection */
+
+			R512_8_rounds(0);
+
+#define	R512_Unroll_R(NN) \
+	((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
+	(SKEIN_UNROLL_512 > (NN)))
+
+#if	R512_Unroll_R(1)
+			R512_8_rounds(1);
+#endif
+#if	R512_Unroll_R(2)
+			R512_8_rounds(2);
+#endif
+#if	R512_Unroll_R(3)
+			R512_8_rounds(3);
+#endif
+#if	R512_Unroll_R(4)
+			R512_8_rounds(4);
+#endif
+#if	R512_Unroll_R(5)
+			R512_8_rounds(5);
+#endif
+#if	R512_Unroll_R(6)
+			R512_8_rounds(6);
+#endif
+#if	R512_Unroll_R(7)
+			R512_8_rounds(7);
+#endif
+#if	R512_Unroll_R(8)
+			R512_8_rounds(8);
+#endif
+#if	R512_Unroll_R(9)
+			R512_8_rounds(9);
+#endif
+#if	R512_Unroll_R(10)
+			R512_8_rounds(10);
+#endif
+#if	R512_Unroll_R(11)
+			R512_8_rounds(11);
+#endif
+#if	R512_Unroll_R(12)
+			R512_8_rounds(12);
+#endif
+#if	R512_Unroll_R(13)
+			R512_8_rounds(13);
+#endif
+#if	R512_Unroll_R(14)
+			R512_8_rounds(14);
+#endif
+#if	(SKEIN_UNROLL_512 > 14)
+#error "need more unrolling in Skein_512_Process_Block"
+#endif
+		}
+
+		/*
+		 * do the final "feedforward" xor, update context chaining vars
+		 */
+		ctx->X[0] = X0 ^ w[0];
+		ctx->X[1] = X1 ^ w[1];
+		ctx->X[2] = X2 ^ w[2];
+		ctx->X[3] = X3 ^ w[3];
+		ctx->X[4] = X4 ^ w[4];
+		ctx->X[5] = X5 ^ w[5];
+		ctx->X[6] = X6 ^ w[6];
+		ctx->X[7] = X7 ^ w[7];
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+	}
+	while (--blkCnt);
+	ctx->h.T[0] = ts[0];
+	ctx->h.T[1] = ts[1];
+}
+
+#if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)	/* build diagnostics */
+size_t
+Skein_512_Process_Block_CodeSize(void)
+{
+	const uint8_t *self = (uint8_t *)Skein_512_Process_Block_CodeSize;
+	return (self - (uint8_t *)Skein_512_Process_Block);
+}
+/* report the unroll factor this file was compiled with */
+uint_t
+Skein_512_Unroll_Cnt(void)
+{
+	return (SKEIN_UNROLL_512);
+}
+#endif	/* SKEIN_CODE_SIZE || SKEIN_PERF */
+#endif
+
+/*  Skein1024 */
+#if	!(SKEIN_USE_ASM & 1024)
+void
+Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd)
+{
+	/* do it in C, always looping (unrolled is bigger AND slower!) */
+	enum {
+		WCNT = SKEIN1024_STATE_WORDS
+	};
+#undef  RCNT
+#define	RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
+
+#ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
+#define	SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#else
+#define	SKEIN_UNROLL_1024 (0)
+#endif
+
+#if	(SKEIN_UNROLL_1024 != 0)
+#if	(RCNT % SKEIN_UNROLL_1024)
+#error "Invalid SKEIN_UNROLL_1024"	/* sanity check on unroll count */
+#endif
+	size_t r;
+	/* key schedule words : chaining vars + tweak + "rotation" */
+	uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
+#endif
+
+	/* local copy of vars, for speed */
+	uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
+	    X12, X13, X14, X15;
+	uint64_t w[WCNT];		/* local copy of input block */
+#ifdef	SKEIN_DEBUG
+	/* use for debugging (help compiler put Xn in registers) */
+	const uint64_t *Xptr[16];
+	Xptr[0] = &X00;
+	Xptr[1] = &X01;
+	Xptr[2] = &X02;
+	Xptr[3] = &X03;
+	Xptr[4] = &X04;
+	Xptr[5] = &X05;
+	Xptr[6] = &X06;
+	Xptr[7] = &X07;
+	Xptr[8] = &X08;
+	Xptr[9] = &X09;
+	Xptr[10] = &X10;
+	Xptr[11] = &X11;
+	Xptr[12] = &X12;
+	Xptr[13] = &X13;
+	Xptr[14] = &X14;
+	Xptr[15] = &X15;
+#endif
+	/* per block: key schedule, key injection, rounds, feed-forward */
+	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
+	ts[0] = ctx->h.T[0];
+	ts[1] = ctx->h.T[1];
+	do {
+		/*
+		 * this implementation only supports 2**64 input bytes
+		 * (no carry out here)
+		 */
+		ts[0] += byteCntAdd;	/* update processed length */
+
+		/* precompute the key schedule for this block */
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ctx->X[4];
+		ks[5] = ctx->X[5];
+		ks[6] = ctx->X[6];
+		ks[7] = ctx->X[7];
+		ks[8] = ctx->X[8];
+		ks[9] = ctx->X[9];
+		ks[10] = ctx->X[10];
+		ks[11] = ctx->X[11];
+		ks[12] = ctx->X[12];
+		ks[13] = ctx->X[13];
+		ks[14] = ctx->X[14];
+		ks[15] = ctx->X[15];
+		ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
+		    ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
+		    ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
+
+		ts[2] = ts[0] ^ ts[1];	/* third tweak word = T0 ^ T1 */
+
+		/* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT);
+		DebugSaveTweak(ctx);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X00 = w[0] + ks[0];	/* do the first full key injection */
+		X01 = w[1] + ks[1];
+		X02 = w[2] + ks[2];
+		X03 = w[3] + ks[3];
+		X04 = w[4] + ks[4];
+		X05 = w[5] + ks[5];
+		X06 = w[6] + ks[6];
+		X07 = w[7] + ks[7];
+		X08 = w[8] + ks[8];
+		X09 = w[9] + ks[9];
+		X10 = w[10] + ks[10];
+		X11 = w[11] + ks[11];
+		X12 = w[12] + ks[12];
+		X13 = w[13] + ks[13] + ts[0];
+		X14 = w[14] + ks[14] + ts[1];
+		X15 = w[15] + ks[15];
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+		    Xptr);
+		/* a round: 8 add/rotate/xor steps, one per word pair */
+#define	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
+	pD, pE, pF, ROT, rNum)						\
+	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
+	X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
+	X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
+	X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
+	X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
+
+#if	SKEIN_UNROLL_1024 == 0	/* fully unrolled: constant ks/ts indices */
+#define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,	\
+	pE, pF, ROT, rn)						\
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
+	pD, pE, pF, ROT, rn)						\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
+
+#define	I1024(R)							\
+	X00 += ks[((R) + 1) % 17];	/* inject the key schedule value */\
+	X01 += ks[((R) + 2) % 17];					\
+	X02 += ks[((R) + 3) % 17];					\
+	X03 += ks[((R) + 4) % 17];					\
+	X04 += ks[((R) + 5) % 17];					\
+	X05 += ks[((R) + 6) % 17];					\
+	X06 += ks[((R) + 7) % 17];					\
+	X07 += ks[((R) + 8) % 17];					\
+	X08 += ks[((R) + 9) % 17];					\
+	X09 += ks[((R) + 10) % 17];					\
+	X10 += ks[((R) + 11) % 17];					\
+	X11 += ks[((R) + 12) % 17];					\
+	X12 += ks[((R) + 13) % 17];					\
+	X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];			\
+	X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];			\
+	X15 += ks[((R) + 16) % 17] + (R) +1;				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else				/* looping version */
+#define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,	\
+	pE, pF, ROT, rn)						\
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
+	pD, pE, pF, ROT, rn)						\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
+
+#define	I1024(R)							\
+	X00 += ks[r + (R) + 0];	/* inject the key schedule value */	\
+	X01 += ks[r + (R) + 1];						\
+	X02 += ks[r + (R) + 2];						\
+	X03 += ks[r + (R) + 3];						\
+	X04 += ks[r + (R) + 4];						\
+	X05 += ks[r + (R) + 5];						\
+	X06 += ks[r + (R) + 6];						\
+	X07 += ks[r + (R) + 7];						\
+	X08 += ks[r + (R) + 8];						\
+	X09 += ks[r + (R) + 9];						\
+	X10 += ks[r + (R) + 10];					\
+	X11 += ks[r + (R) + 11];					\
+	X12 += ks[r + (R) + 12];					\
+	X13 += ks[r + (R) + 13] + ts[r + (R) + 0];			\
+	X14 += ks[r + (R) + 14] + ts[r + (R) + 1];			\
+	X15 += ks[r + (R) + 15] +  r + (R);				\
+	ks[r + (R) + 16] = ks[r + (R) - 1];	/* rotate key schedule */\
+	ts[r + (R) + 2] = ts[r + (R) - 1];				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		/* loop: each pass runs SKEIN_UNROLL_1024 groups of 8 rounds */
+		for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
+#endif				/* end of looped code definitions */
+		{
+#define	R1024_8_rounds(R)	/* do 8 full rounds */			\
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,	\
+	    14, 15, R1024_0, 8 * (R) + 1);				\
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,	\
+	    08, 01, R1024_1, 8 * (R) + 2);				\
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,	\
+	    10, 09, R1024_2, 8 * (R) + 3);				\
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,	\
+	    12, 07, R1024_3, 8 * (R) + 4);				\
+	I1024(2 * (R));							\
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,	\
+	    14, 15, R1024_4, 8 * (R) + 5);				\
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,	\
+	    08, 01, R1024_5, 8 * (R) + 6);				\
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,	\
+	    10, 09, R1024_6, 8 * (R) + 7);				\
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,	\
+	    12, 07, R1024_7, 8 * (R) + 8);				\
+	I1024(2 * (R) + 1);		/* and key injection */
+
+			R1024_8_rounds(0);
+			/* should 8-round group NN be emitted here? */
+#define	R1024_Unroll_R(NN)						\
+	((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) ||	\
+	(SKEIN_UNROLL_1024 > (NN)))
+
+#if	R1024_Unroll_R(1)
+			R1024_8_rounds(1);
+#endif
+#if	R1024_Unroll_R(2)
+			R1024_8_rounds(2);
+#endif
+#if	R1024_Unroll_R(3)
+			R1024_8_rounds(3);
+#endif
+#if	R1024_Unroll_R(4)
+			R1024_8_rounds(4);
+#endif
+#if	R1024_Unroll_R(5)
+			R1024_8_rounds(5);
+#endif
+#if	R1024_Unroll_R(6)
+			R1024_8_rounds(6);
+#endif
+#if	R1024_Unroll_R(7)
+			R1024_8_rounds(7);
+#endif
+#if	R1024_Unroll_R(8)
+			R1024_8_rounds(8);
+#endif
+#if	R1024_Unroll_R(9)
+			R1024_8_rounds(9);
+#endif
+#if	R1024_Unroll_R(10)
+			R1024_8_rounds(10);
+#endif
+#if	R1024_Unroll_R(11)
+			R1024_8_rounds(11);
+#endif
+#if	R1024_Unroll_R(12)
+			R1024_8_rounds(12);
+#endif
+#if	R1024_Unroll_R(13)
+			R1024_8_rounds(13);
+#endif
+#if	R1024_Unroll_R(14)
+			R1024_8_rounds(14);
+#endif
+#if	(SKEIN_UNROLL_1024 > 14)
+#error  "need more unrolling in Skein_1024_Process_Block"
+#endif
+		}
+		/*
+		 * do the final "feedforward" xor, update context chaining vars
+		 */
+
+		ctx->X[0] = X00 ^ w[0];
+		ctx->X[1] = X01 ^ w[1];
+		ctx->X[2] = X02 ^ w[2];
+		ctx->X[3] = X03 ^ w[3];
+		ctx->X[4] = X04 ^ w[4];
+		ctx->X[5] = X05 ^ w[5];
+		ctx->X[6] = X06 ^ w[6];
+		ctx->X[7] = X07 ^ w[7];
+		ctx->X[8] = X08 ^ w[8];
+		ctx->X[9] = X09 ^ w[9];
+		ctx->X[10] = X10 ^ w[10];
+		ctx->X[11] = X11 ^ w[11];
+		ctx->X[12] = X12 ^ w[12];
+		ctx->X[13] = X13 ^ w[13];
+		ctx->X[14] = X14 ^ w[14];
+		ctx->X[15] = X15 ^ w[15];
+
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+		ts[1] &= ~SKEIN_T1_FLAG_FIRST;	/* clear first-block flag */
+		blkPtr += SKEIN1024_BLOCK_BYTES;
+	} while (--blkCnt);
+	ctx->h.T[0] = ts[0];		/* write the tweak back to the context */
+	ctx->h.T[1] = ts[1];
+}
+
+#if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)	/* build diagnostics */
+size_t
+Skein1024_Process_Block_CodeSize(void)
+{
+	const uint8_t *self = (uint8_t *)Skein1024_Process_Block_CodeSize;
+	return (self - (uint8_t *)Skein1024_Process_Block);
+}
+/* report the unroll factor this file was compiled with */
+uint_t
+Skein1024_Unroll_Cnt(void)
+{
+	return (SKEIN_UNROLL_1024);
+}
+#endif	/* SKEIN_CODE_SIZE || SKEIN_PERF */
+#endif
diff --git a/module/icp/algs/skein/skein_impl.h b/module/icp/algs/skein/skein_impl.h
new file mode 100644
index 000000000..e83a06971
--- /dev/null
+++ b/module/icp/algs/skein/skein_impl.h
@@ -0,0 +1,289 @@
+/*
+ * Internal definitions for Skein hashing.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ *
+ * The following compile-time switches may be defined to control some
+ * tradeoffs between speed, code size, error checking, and security.
+ *
+ * The "default" note explains what happens when the switch is not defined.
+ *
+ *  SKEIN_DEBUG            -- make callouts from inside Skein code
+ *                            to examine/display intermediate values.
+ *                            [default: no callouts (no overhead)]
+ *
+ *  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+ *                            code. If not defined, most error checking
+ *                            is disabled (for performance). Otherwise,
+ *                            internal errors trip ASSERT(); caller errors
+ *                            return the failure code to the caller, or
+ *                            also trip ASSERT() if SKEIN_ASSERT is defined
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef	_SKEIN_IMPL_H_
+#define	_SKEIN_IMPL_H_
+
+#include <sys/skein.h>
+#include "skein_impl.h"
+#include "skein_port.h"
+
+/* determine where we can get bcopy/bzero declarations */
+#ifdef	_KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+
+/*
+ * "Internal" Skein definitions
+ *    -- not needed for sequential hashing API, but will be
+ *           helpful for other uses of Skein (e.g., tree hash mode).
+ *    -- included here so that they can be shared between
+ *           reference and optimized code.
+ */
+
+/* tweak word T[1]: bit field starting positions */
+/* offset 64 because it's the second word  */
+#define	SKEIN_T1_BIT(BIT)	((BIT) - 64)
+
+/* bits 112..118: level in hash tree */
+#define	SKEIN_T1_POS_TREE_LVL	SKEIN_T1_BIT(112)
+/* bit  119: partial final input byte */
+#define	SKEIN_T1_POS_BIT_PAD	SKEIN_T1_BIT(119)
+/* bits 120..125: type field */
+#define	SKEIN_T1_POS_BLK_TYPE	SKEIN_T1_BIT(120)
+/* bit  126: first block flag */
+#define	SKEIN_T1_POS_FIRST	SKEIN_T1_BIT(126)
+/* bit  127: final block flag */
+#define	SKEIN_T1_POS_FINAL	SKEIN_T1_BIT(127)
+
+/* tweak word T[1]: flag bit definition(s) */
+#define	SKEIN_T1_FLAG_FIRST	(((uint64_t)1) << SKEIN_T1_POS_FIRST)
+#define	SKEIN_T1_FLAG_FINAL	(((uint64_t)1) << SKEIN_T1_POS_FINAL)
+#define	SKEIN_T1_FLAG_BIT_PAD	(((uint64_t)1) << SKEIN_T1_POS_BIT_PAD)
+
+/* tweak word T[1]: tree level bit field mask */
+#define	SKEIN_T1_TREE_LVL_MASK	(((uint64_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define	SKEIN_T1_TREE_LEVEL(n)	(((uint64_t)(n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define	SKEIN_BLK_TYPE_KEY	(0)	/* key, for MAC and KDF */
+#define	SKEIN_BLK_TYPE_CFG	(4)	/* configuration block */
+#define	SKEIN_BLK_TYPE_PERS	(8)	/* personalization string */
+#define	SKEIN_BLK_TYPE_PK	(12)	/* public key (for signature hashing) */
+#define	SKEIN_BLK_TYPE_KDF	(16)	/* key identifier for KDF */
+#define	SKEIN_BLK_TYPE_NONCE	(20)	/* nonce for PRNG */
+#define	SKEIN_BLK_TYPE_MSG	(48)	/* message processing */
+#define	SKEIN_BLK_TYPE_OUT	(63)	/* output stage */
+#define	SKEIN_BLK_TYPE_MASK	(63)	/* bit field mask */
+
+#define	SKEIN_T1_BLK_TYPE(T)	\
+	(((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+/* key, for MAC and KDF */
+#define	SKEIN_T1_BLK_TYPE_KEY	SKEIN_T1_BLK_TYPE(KEY)
+/* configuration block */
+#define	SKEIN_T1_BLK_TYPE_CFG	SKEIN_T1_BLK_TYPE(CFG)
+/* personalization string */
+#define	SKEIN_T1_BLK_TYPE_PERS	SKEIN_T1_BLK_TYPE(PERS)
+/* public key (for digital signature hashing) */
+#define	SKEIN_T1_BLK_TYPE_PK	SKEIN_T1_BLK_TYPE(PK)
+/* key identifier for KDF */
+#define	SKEIN_T1_BLK_TYPE_KDF	SKEIN_T1_BLK_TYPE(KDF)
+/* nonce for PRNG */
+#define	SKEIN_T1_BLK_TYPE_NONCE	SKEIN_T1_BLK_TYPE(NONCE)
+/* message processing */
+#define	SKEIN_T1_BLK_TYPE_MSG	SKEIN_T1_BLK_TYPE(MSG)
+/* output stage */
+#define	SKEIN_T1_BLK_TYPE_OUT	SKEIN_T1_BLK_TYPE(OUT)
+/* field bit mask */
+#define	SKEIN_T1_BLK_TYPE_MASK	SKEIN_T1_BLK_TYPE(MASK)
+
+#define	SKEIN_T1_BLK_TYPE_CFG_FINAL	\
+	(SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define	SKEIN_T1_BLK_TYPE_OUT_FINAL	\
+	(SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define	SKEIN_VERSION		(1)
+
+#ifndef	SKEIN_ID_STRING_LE	/* allow compile-time personalization */
+#define	SKEIN_ID_STRING_LE	(0x33414853)	/* "SHA3" (little-endian) */
+#endif
+
+#define	SKEIN_MK_64(hi32, lo32)	((lo32) + (((uint64_t)(hi32)) << 32))
+#define	SKEIN_SCHEMA_VER	SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE)
+#define	SKEIN_KS_PARITY		SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
+
+#define	SKEIN_CFG_STR_LEN	(4*8)
+
+/* bit field definitions in config block treeInfo word */
+#define	SKEIN_CFG_TREE_LEAF_SIZE_POS	(0)
+#define	SKEIN_CFG_TREE_NODE_SIZE_POS	(8)
+#define	SKEIN_CFG_TREE_MAX_LEVEL_POS	(16)
+
+#define	SKEIN_CFG_TREE_LEAF_SIZE_MSK	\
+	(((uint64_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define	SKEIN_CFG_TREE_NODE_SIZE_MSK	\
+	(((uint64_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define	SKEIN_CFG_TREE_MAX_LEVEL_MSK	\
+	(((uint64_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define	SKEIN_CFG_TREE_INFO(leaf, node, maxLvl)			\
+	((((uint64_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |	\
+	(((uint64_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) |	\
+	(((uint64_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS))
+
+/* use as treeInfo in InitExt() call for sequential processing */
+#define	SKEIN_CFG_TREE_INFO_SEQUENTIAL	SKEIN_CFG_TREE_INFO(0, 0, 0)
+
+/*
+ * Skein macros for getting/setting tweak words, etc.
+ * These are useful for partial input bytes, hash tree init/update, etc.
+ */
+#define	Skein_Get_Tweak(ctxPtr, TWK_NUM)	((ctxPtr)->h.T[TWK_NUM])
+#define	Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal)		\
+	do {						\
+		(ctxPtr)->h.T[TWK_NUM] = (tVal);	\
+		_NOTE(CONSTCOND)			\
+	} while (0)
+
+#define	Skein_Get_T0(ctxPtr)		Skein_Get_Tweak(ctxPtr, 0)
+#define	Skein_Get_T1(ctxPtr)		Skein_Get_Tweak(ctxPtr, 1)
+#define	Skein_Set_T0(ctxPtr, T0)	Skein_Set_Tweak(ctxPtr, 0, T0)
+#define	Skein_Set_T1(ctxPtr, T1)	Skein_Set_Tweak(ctxPtr, 1, T1)
+
+/* set both tweak words at once */
+#define	Skein_Set_T0_T1(ctxPtr, T0, T1)		\
+	do {					\
+		Skein_Set_T0(ctxPtr, (T0));	\
+		Skein_Set_T1(ctxPtr, (T1));	\
+		_NOTE(CONSTCOND)		\
+	} while (0)
+
+#define	Skein_Set_Type(ctxPtr, BLK_TYPE)	\
+	Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/*
+ * set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0;
+ */
+#define	Skein_Start_New_Type(ctxPtr, BLK_TYPE)				\
+	do {								\
+		Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST |	\
+		    SKEIN_T1_BLK_TYPE_ ## BLK_TYPE);			\
+		(ctxPtr)->h.bCnt = 0;	\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+#define	Skein_Clear_First_Flag(hdr)					\
+	do {								\
+		(hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;			\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+#define	Skein_Set_Bit_Pad_Flag(hdr)					\
+	do {								\
+		(hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;			\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+#define	Skein_Set_Tree_Level(hdr, height)				\
+	do {								\
+		(hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);		\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+/*
+ * "Internal" Skein definitions for debugging and error checking
+ * Note: in Illumos we always disable debugging features.
+ */
+#define	Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr)
+#define	Skein_Show_Round(bits, ctx, r, X)
+#define	Skein_Show_R_Ptr(bits, ctx, r, X_ptr)
+#define	Skein_Show_Final(bits, ctx, cnt, outPtr)
+#define	Skein_Show_Key(bits, ctx, key, keyBytes)
+
+/* run-time checks (e.g., bad params, uninitialized context)? */
+#ifndef	SKEIN_ERR_CHECK
+/* default: ignore all Asserts, for performance */
+#define	Skein_Assert(x, retCode)
+#define	Skein_assert(x)
+#elif	defined(SKEIN_ASSERT)	/* every failed check trips ASSERT() */
+#include <sys/debug.h>
+#define	Skein_Assert(x, retCode)	ASSERT(x)
+#define	Skein_assert(x)			ASSERT(x)
+#else	/* SKEIN_ERR_CHECK defined, SKEIN_ASSERT not defined */
+#include <sys/debug.h>
+/* caller error: make the enclosing function return retCode */
+#define	Skein_Assert(x, retCode)		\
+	do {					\
+		if (!(x))			\
+			return (retCode);	\
+		_NOTE(CONSTCOND)		\
+	} while (0)
+/* internal error */
+#define	Skein_assert(x)	ASSERT(x)
+#endif	/* SKEIN_ERR_CHECK */
+
+/*
+ * Skein block function constants (shared across Ref and Opt code)
+ */
+enum {
+	/* Skein_256 round rotation constants */
+	R_256_0_0 = 14, R_256_0_1 = 16,
+	R_256_1_0 = 52, R_256_1_1 = 57,
+	R_256_2_0 = 23, R_256_2_1 = 40,
+	R_256_3_0 = 5, R_256_3_1 = 37,
+	R_256_4_0 = 25, R_256_4_1 = 33,
+	R_256_5_0 = 46, R_256_5_1 = 12,
+	R_256_6_0 = 58, R_256_6_1 = 22,
+	R_256_7_0 = 32, R_256_7_1 = 32,
+
+	/* Skein_512 round rotation constants */
+	R_512_0_0 = 46, R_512_0_1 = 36, R_512_0_2 = 19, R_512_0_3 = 37,
+	R_512_1_0 = 33, R_512_1_1 = 27, R_512_1_2 = 14, R_512_1_3 = 42,
+	R_512_2_0 = 17, R_512_2_1 = 49, R_512_2_2 = 36, R_512_2_3 = 39,
+	R_512_3_0 = 44, R_512_3_1 = 9, R_512_3_2 = 54, R_512_3_3 = 56,
+	R_512_4_0 = 39, R_512_4_1 = 30, R_512_4_2 = 34, R_512_4_3 = 24,
+	R_512_5_0 = 13, R_512_5_1 = 50, R_512_5_2 = 10, R_512_5_3 = 17,
+	R_512_6_0 = 25, R_512_6_1 = 29, R_512_6_2 = 39, R_512_6_3 = 43,
+	R_512_7_0 = 8, R_512_7_1 = 35, R_512_7_2 = 56, R_512_7_3 = 22,
+
+	/* Skein1024 round rotation constants */
+	R1024_0_0 = 24, R1024_0_1 = 13, R1024_0_2 = 8, R1024_0_3 =
+	    47, R1024_0_4 = 8, R1024_0_5 = 17, R1024_0_6 = 22, R1024_0_7 = 37,
+	R1024_1_0 = 38, R1024_1_1 = 19, R1024_1_2 = 10, R1024_1_3 =
+	    55, R1024_1_4 = 49, R1024_1_5 = 18, R1024_1_6 = 23, R1024_1_7 = 52,
+	R1024_2_0 = 33, R1024_2_1 = 4, R1024_2_2 = 51, R1024_2_3 =
+	    13, R1024_2_4 = 34, R1024_2_5 = 41, R1024_2_6 = 59, R1024_2_7 = 17,
+	R1024_3_0 = 5, R1024_3_1 = 20, R1024_3_2 = 48, R1024_3_3 =
+	    41, R1024_3_4 = 47, R1024_3_5 = 28, R1024_3_6 = 16, R1024_3_7 = 25,
+	R1024_4_0 = 41, R1024_4_1 = 9, R1024_4_2 = 37, R1024_4_3 =
+	    31, R1024_4_4 = 12, R1024_4_5 = 47, R1024_4_6 = 44, R1024_4_7 = 30,
+	R1024_5_0 = 16, R1024_5_1 = 34, R1024_5_2 = 56, R1024_5_3 =
+	    51, R1024_5_4 = 4, R1024_5_5 = 53, R1024_5_6 = 42, R1024_5_7 = 41,
+	R1024_6_0 = 31, R1024_6_1 = 44, R1024_6_2 = 47, R1024_6_3 =
+	    46, R1024_6_4 = 19, R1024_6_5 = 42, R1024_6_6 = 44, R1024_6_7 = 25,
+	R1024_7_0 = 9, R1024_7_1 = 48, R1024_7_2 = 35, R1024_7_3 =
+	    52, R1024_7_4 = 23, R1024_7_5 = 31, R1024_7_6 = 37, R1024_7_7 = 20
+};
+
+/* number of rounds for the different block sizes */
+#define	SKEIN_256_ROUNDS_TOTAL	(72)
+#define	SKEIN_512_ROUNDS_TOTAL	(72)
+#define	SKEIN1024_ROUNDS_TOTAL	(80)
+
+
+extern const uint64_t SKEIN_256_IV_128[];
+extern const uint64_t SKEIN_256_IV_160[];
+extern const uint64_t SKEIN_256_IV_224[];
+extern const uint64_t SKEIN_256_IV_256[];
+extern const uint64_t SKEIN_512_IV_128[];
+extern const uint64_t SKEIN_512_IV_160[];
+extern const uint64_t SKEIN_512_IV_224[];
+extern const uint64_t SKEIN_512_IV_256[];
+extern const uint64_t SKEIN_512_IV_384[];
+extern const uint64_t SKEIN_512_IV_512[];
+extern const uint64_t SKEIN1024_IV_384[];
+extern const uint64_t SKEIN1024_IV_512[];
+extern const uint64_t SKEIN1024_IV_1024[];
+
+#endif	/* _SKEIN_IMPL_H_ */
diff --git a/module/icp/algs/skein/skein_iv.c b/module/icp/algs/skein/skein_iv.c
new file mode 100644
index 000000000..140d38f76
--- /dev/null
+++ b/module/icp/algs/skein/skein_iv.c
@@ -0,0 +1,185 @@
+/*
+ * Pre-computed Skein IVs
+ *
+ * NOTE: these values are not "magic" constants, but
+ * are generated using the Threefish block function.
+ * They are pre-computed here only for speed; i.e., to
+ * avoid the need for a Threefish call during Init().
+ *
+ * The IV for any fixed hash length may be pre-computed.
+ * Only the most common values are included here.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+/*
+ * Illumos implementation note: these constants are for Skein v1.3 as per:
+ * http://www.skein-hash.info/sites/default/files/skein1.3.pdf
+ */
+
+#include <sys/skein.h>		/* get Skein macros and types */
+#include "skein_impl.h"		/* get internal definitions */
+
+#define	MK_64 SKEIN_MK_64
+
+/* blkSize =  256 bits. hashSize =  128 bits */
+const uint64_t SKEIN_256_IV_128[] = {
+	MK_64(0xE1111906, 0x964D7260),
+	MK_64(0x883DAAA7, 0x7C8D811C),
+	MK_64(0x10080DF4, 0x91960F7A),
+	MK_64(0xCCF7DDE5, 0xB45BC1C2)
+};
+
+/* blkSize =  256 bits. hashSize =  160 bits */
+const uint64_t SKEIN_256_IV_160[] = {
+	MK_64(0x14202314, 0x72825E98),
+	MK_64(0x2AC4E9A2, 0x5A77E590),
+	MK_64(0xD47A5856, 0x8838D63E),
+	MK_64(0x2DD2E496, 0x8586AB7D)
+};
+
+/* blkSize =  256 bits. hashSize =  224 bits */
+const uint64_t SKEIN_256_IV_224[] = {
+	MK_64(0xC6098A8C, 0x9AE5EA0B),
+	MK_64(0x876D5686, 0x08C5191C),
+	MK_64(0x99CB88D7, 0xD7F53884),
+	MK_64(0x384BDDB1, 0xAEDDB5DE)
+};
+
+/* blkSize =  256 bits. hashSize =  256 bits */
+const uint64_t SKEIN_256_IV_256[] = {
+	MK_64(0xFC9DA860, 0xD048B449),
+	MK_64(0x2FCA6647, 0x9FA7D833),
+	MK_64(0xB33BC389, 0x6656840F),
+	MK_64(0x6A54E920, 0xFDE8DA69)
+};
+
+/* blkSize =  512 bits. hashSize =  128 bits */
+const uint64_t SKEIN_512_IV_128[] = {
+	MK_64(0xA8BC7BF3, 0x6FBF9F52),
+	MK_64(0x1E9872CE, 0xBD1AF0AA),
+	MK_64(0x309B1790, 0xB32190D3),
+	MK_64(0xBCFBB854, 0x3F94805C),
+	MK_64(0x0DA61BCD, 0x6E31B11B),
+	MK_64(0x1A18EBEA, 0xD46A32E3),
+	MK_64(0xA2CC5B18, 0xCE84AA82),
+	MK_64(0x6982AB28, 0x9D46982D)
+};
+
+/* blkSize =  512 bits. hashSize =  160 bits */
+const uint64_t SKEIN_512_IV_160[] = {
+	MK_64(0x28B81A2A, 0xE013BD91),
+	MK_64(0xC2F11668, 0xB5BDF78F),
+	MK_64(0x1760D8F3, 0xF6A56F12),
+	MK_64(0x4FB74758, 0x8239904F),
+	MK_64(0x21EDE07F, 0x7EAF5056),
+	MK_64(0xD908922E, 0x63ED70B8),
+	MK_64(0xB8EC76FF, 0xECCB52FA),
+	MK_64(0x01A47BB8, 0xA3F27A6E)
+};
+
+/* blkSize =  512 bits. hashSize =  224 bits */
+const uint64_t SKEIN_512_IV_224[] = {
+	MK_64(0xCCD06162, 0x48677224),
+	MK_64(0xCBA65CF3, 0xA92339EF),
+	MK_64(0x8CCD69D6, 0x52FF4B64),
+	MK_64(0x398AED7B, 0x3AB890B4),
+	MK_64(0x0F59D1B1, 0x457D2BD0),
+	MK_64(0x6776FE65, 0x75D4EB3D),
+	MK_64(0x99FBC70E, 0x997413E9),
+	MK_64(0x9E2CFCCF, 0xE1C41EF7)
+};
+
+/* blkSize =  512 bits. hashSize =  256 bits */
+const uint64_t SKEIN_512_IV_256[] = {
+	MK_64(0xCCD044A1, 0x2FDB3E13),
+	MK_64(0xE8359030, 0x1A79A9EB),
+	MK_64(0x55AEA061, 0x4F816E6F),
+	MK_64(0x2A2767A4, 0xAE9B94DB),
+	MK_64(0xEC06025E, 0x74DD7683),
+	MK_64(0xE7A436CD, 0xC4746251),
+	MK_64(0xC36FBAF9, 0x393AD185),
+	MK_64(0x3EEDBA18, 0x33EDFC13)
+};
+
+/* blkSize =  512 bits. hashSize =  384 bits */
+const uint64_t SKEIN_512_IV_384[] = {
+	MK_64(0xA3F6C6BF, 0x3A75EF5F),
+	MK_64(0xB0FEF9CC, 0xFD84FAA4),
+	MK_64(0x9D77DD66, 0x3D770CFE),
+	MK_64(0xD798CBF3, 0xB468FDDA),
+	MK_64(0x1BC4A666, 0x8A0E4465),
+	MK_64(0x7ED7D434, 0xE5807407),
+	MK_64(0x548FC1AC, 0xD4EC44D6),
+	MK_64(0x266E1754, 0x6AA18FF8)
+};
+
+/* blkSize =  512 bits. hashSize =  512 bits */
+const uint64_t SKEIN_512_IV_512[] = {
+	MK_64(0x4903ADFF, 0x749C51CE),
+	MK_64(0x0D95DE39, 0x9746DF03),
+	MK_64(0x8FD19341, 0x27C79BCE),
+	MK_64(0x9A255629, 0xFF352CB1),
+	MK_64(0x5DB62599, 0xDF6CA7B0),
+	MK_64(0xEABE394C, 0xA9D5C3F4),
+	MK_64(0x991112C7, 0x1A75B523),
+	MK_64(0xAE18A40B, 0x660FCC33)
+};
+
+/* blkSize = 1024 bits. hashSize =  384 bits */
+const uint64_t SKEIN1024_IV_384[] = {
+	MK_64(0x5102B6B8, 0xC1894A35),
+	MK_64(0xFEEBC9E3, 0xFE8AF11A),
+	MK_64(0x0C807F06, 0xE32BED71),
+	MK_64(0x60C13A52, 0xB41A91F6),
+	MK_64(0x9716D35D, 0xD4917C38),
+	MK_64(0xE780DF12, 0x6FD31D3A),
+	MK_64(0x797846B6, 0xC898303A),
+	MK_64(0xB172C2A8, 0xB3572A3B),
+	MK_64(0xC9BC8203, 0xA6104A6C),
+	MK_64(0x65909338, 0xD75624F4),
+	MK_64(0x94BCC568, 0x4B3F81A0),
+	MK_64(0x3EBBF51E, 0x10ECFD46),
+	MK_64(0x2DF50F0B, 0xEEB08542),
+	MK_64(0x3B5A6530, 0x0DBC6516),
+	MK_64(0x484B9CD2, 0x167BBCE1),
+	MK_64(0x2D136947, 0xD4CBAFEA)
+};
+
+/* blkSize = 1024 bits. hashSize =  512 bits */
+const uint64_t SKEIN1024_IV_512[] = {
+	MK_64(0xCAEC0E5D, 0x7C1B1B18),
+	MK_64(0xA01B0E04, 0x5F03E802),
+	MK_64(0x33840451, 0xED912885),
+	MK_64(0x374AFB04, 0xEAEC2E1C),
+	MK_64(0xDF25A0E2, 0x813581F7),
+	MK_64(0xE4004093, 0x8B12F9D2),
+	MK_64(0xA662D539, 0xC2ED39B6),
+	MK_64(0xFA8B85CF, 0x45D8C75A),
+	MK_64(0x8316ED8E, 0x29EDE796),
+	MK_64(0x053289C0, 0x2E9F91B8),
+	MK_64(0xC3F8EF1D, 0x6D518B73),
+	MK_64(0xBDCEC3C4, 0xD5EF332E),
+	MK_64(0x549A7E52, 0x22974487),
+	MK_64(0x67070872, 0x5B749816),
+	MK_64(0xB9CD28FB, 0xF0581BD1),
+	MK_64(0x0E2940B8, 0x15804974)
+};
+
+/* blkSize = 1024 bits. hashSize = 1024 bits */
+const uint64_t SKEIN1024_IV_1024[] = {
+	MK_64(0xD593DA07, 0x41E72355),
+	MK_64(0x15B5E511, 0xAC73E00C),
+	MK_64(0x5180E5AE, 0xBAF2C4F0),
+	MK_64(0x03BD41D3, 0xFCBCAFAF),
+	MK_64(0x1CAEC6FD, 0x1983A898),
+	MK_64(0x6E510B8B, 0xCDD0589F),
+	MK_64(0x77E2BDFD, 0xC6394ADA),
+	MK_64(0xC11E1DB5, 0x24DCB0A3),
+	MK_64(0xD6D14AF9, 0xC6329AB5),
+	MK_64(0x6A9B0BFC, 0x6EB67E0D),
+	MK_64(0x9243C60D, 0xCCFF1332),
+	MK_64(0x1A1F1DDE, 0x743F02D4),
+	MK_64(0x0996753C, 0x10ED0BB8),
+	MK_64(0x6572DD22, 0xF2B4969A),
+	MK_64(0x61FD3062, 0xD00A579A),
+	MK_64(0x1DE0536E, 0x8682E539)
+};
diff --git a/module/icp/algs/skein/skein_port.h b/module/icp/algs/skein/skein_port.h
new file mode 100644
index 000000000..1b0225236
--- /dev/null
+++ b/module/icp/algs/skein/skein_port.h
@@ -0,0 +1,128 @@
+/*
+ * Platform-specific definitions for Skein hash function.
+ *
+ * Source code author: Doug Whiting, 2008.
+ *
+ * This algorithm and source code is released to the public domain.
+ *
+ * Many thanks to Brian Gladman for his portable header files.
+ *
+ * To port Skein to an "unsupported" platform, change the definitions
+ * in this file appropriately.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef	_SKEIN_PORT_H_
+#define	_SKEIN_PORT_H_
+
+#include <sys/types.h>	/* get integer type definitions */
+#include <sys/systm.h>	/* for bcopy() */
+
+#ifndef	RotL_64
+#define	RotL_64(x, N)	(((x) << (N)) | ((x) >> (64 - (N))))
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs. The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ *    SKEIN_NEED_SWAP:  0 for little-endian, 1 for big-endian
+ *    Skein_Put64_LSB_First
+ *    Skein_Get64_LSB_First
+ *    Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef	SKEIN_NEED_SWAP		/* compile-time "override" for endianness? */
+
+#include <sys/isa_defs.h>	/* get endianness selection */
+
+#define	PLATFORM_MUST_ALIGN	_ALIGNMENT_REQUIRED
+#if	defined(_BIG_ENDIAN)
+/* here for big-endian CPUs */
+#define	SKEIN_NEED_SWAP   (1)
+#else
+/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define	SKEIN_NEED_SWAP   (0)
+#if	PLATFORM_MUST_ALIGN == 0	/* ok to use "fast" versions? */
+#define	Skein_Put64_LSB_First(dst08, src64, bCnt) bcopy(src64, dst08, bCnt)
+#define	Skein_Get64_LSB_First(dst64, src08, wCnt) \
+	bcopy(src08, dst64, 8 * (wCnt))
+#endif
+#endif
+
+#endif				/* ifndef SKEIN_NEED_SWAP */
+
+/*
+ * Provide any definitions still needed.
+ */
+#ifndef	Skein_Swap64	/* swap for big-endian, nop for little-endian */
+#if	SKEIN_NEED_SWAP
+#define	Skein_Swap64(w64)				\
+	(((((uint64_t)(w64)) & 0xFF) << 56) |		\
+	(((((uint64_t)(w64)) >> 8) & 0xFF) << 48) |	\
+	(((((uint64_t)(w64)) >> 16) & 0xFF) << 40) |	\
+	(((((uint64_t)(w64)) >> 24) & 0xFF) << 32) |	\
+	(((((uint64_t)(w64)) >> 32) & 0xFF) << 24) |	\
+	(((((uint64_t)(w64)) >> 40) & 0xFF) << 16) |	\
+	(((((uint64_t)(w64)) >> 48) & 0xFF) << 8) |	\
+	(((((uint64_t)(w64)) >> 56) & 0xFF)))
+#else
+#define	Skein_Swap64(w64)  (w64)
+#endif
+#endif				/* ifndef Skein_Swap64 */
+
+#ifndef	Skein_Put64_LSB_First
+void
+Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt)
+#ifdef	SKEIN_PORT_CODE		/* instantiate the function code here? */
+{
+	/* endian-neutral, hence slow: emits one byte per iteration */
+	size_t idx;
+
+	for (idx = 0; idx < bCnt; idx++) {
+		uint64_t word = src[idx / 8];
+
+		dst[idx] = (uint8_t)(word >> (8 * (idx % 8)));
+	}
+}
+#else
+;				/* output only the function prototype */
+#endif
+#endif				/* ifndef Skein_Put64_LSB_First */
+
+#ifndef	Skein_Get64_LSB_First
+void
+Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt)
+#ifdef	SKEIN_PORT_CODE		/* instantiate the function code here? */
+{
+	/*
+	 * Endian-neutral (and therefore slow) decoder: every output word
+	 * is assembled from 8 consecutive input bytes, with the least
+	 * significant byte stored first.
+	 */
+	size_t off, b;
+
+	for (off = 0; off < 8 * wCnt; off += 8) {
+		uint64_t val = 0;
+
+		/* walk from the top byte down so src[off] ends in bits 0..7 */
+		for (b = 8; b-- > 0; )
+			val = (val << 8) | src[off + b];
+		dst[off / 8] = val;
+	}
+}
+#else
+;				/* output only the function prototype */
+#endif
+#endif				/* ifndef Skein_Get64_LSB_First */
+
+#endif	/* _SKEIN_PORT_H_ */
diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S
index b689c9022..d55c5eb48 100644
--- a/module/icp/asm-x86_64/sha2/sha256_impl.S
+++ b/module/icp/asm-x86_64/sha2/sha256_impl.S
@@ -62,11 +62,9 @@
  */
 
 /*
- * This file was generated by a perl script (sha512-x86_64.pl) that could
- * be used to generate sha256 and sha512 variants from the same code base.
- * For our purposes, we only need sha256 and so getting the perl script to
- * run as part of the build process seemed superfluous. The comments from
- * the original file have been pasted above.
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
  */
 
 #if defined(lint) || defined(__lint)
diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S
new file mode 100644
index 000000000..24a41745b
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha512_impl.S
@@ -0,0 +1,2083 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <[email protected]> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 - >40%. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's very
+ * same instruction sequence used for both SHA-256 and SHA-512. In
+ * former case the instructions operate on 32-bit operands, while in
+ * latter - on 64-bit ones. All I had to do is to get one flavor right,
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers).  Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{	/* empty lint-only stub; the non-lint build uses the ENTRY_NP() code */
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA512TransformBlocks)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	mov	%rsp,%rbp		# copy %rsp
+	shl	$4,%rdx		# num*16
+	sub	$16*8+4*8,%rsp
+	lea	(%rsi,%rdx,8),%rdx	# inp+num*16*8
+	and	$-64,%rsp		# align stack frame
+	add	$8,%rdi		# Skip OpenSolaris field, "algotype"
+	mov	%rdi,16*8+0*8(%rsp)		# save ctx, 1st arg
+	mov	%rsi,16*8+1*8(%rsp)		# save inp, 2nd arg
+	mov	%rdx,16*8+2*8(%rsp)		# save end pointer, "3rd" arg
+	mov	%rbp,16*8+3*8(%rsp)		# save copy of %rsp
+
+	/.picmeup %rbp
+	/ The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+	/ the address of the "next" instruction into the target register
+	/ (%rbp).  This generates these 2 instructions:
+	lea	.Llea(%rip),%rbp
+	/nop	/ .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+	lea	K512-.(%rbp),%rbp
+
+	mov	8*0(%rdi),%rax
+	mov	8*1(%rdi),%rbx
+	mov	8*2(%rdi),%rcx
+	mov	8*3(%rdi),%rdx
+	mov	8*4(%rdi),%r8
+	mov	8*5(%rdi),%r9
+	mov	8*6(%rdi),%r10
+	mov	8*7(%rdi),%r11
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	xor	%rdi,%rdi
+	mov	8*0(%rsi),%r12
+	bswap	%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,0(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	8*1(%rsi),%r12
+	bswap	%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,8(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	8*2(%rsi),%r12
+	bswap	%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,16(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	8*3(%rsi),%r12
+	bswap	%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,24(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	8*4(%rsi),%r12
+	bswap	%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,32(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	8*5(%rsi),%r12
+	bswap	%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,40(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	8*6(%rsi),%r12
+	bswap	%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,48(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	8*7(%rsi),%r12
+	bswap	%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,56(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	mov	8*8(%rsi),%r12
+	bswap	%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,64(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	8*9(%rsi),%r12
+	bswap	%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,72(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	8*10(%rsi),%r12
+	bswap	%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,80(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	8*11(%rsi),%r12
+	bswap	%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,88(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	8*12(%rsi),%r12
+	bswap	%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,96(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	8*13(%rsi),%r12
+	bswap	%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,104(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	8*14(%rsi),%r12
+	bswap	%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,112(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	8*15(%rsi),%r12
+	bswap	%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,120(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	mov	8(%rsp),%r13
+	mov	112(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	72(%rsp),%r12
+
+	add	0(%rsp),%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,0(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	16(%rsp),%r13
+	mov	120(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	80(%rsp),%r12
+
+	add	8(%rsp),%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,8(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	24(%rsp),%r13
+	mov	0(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	88(%rsp),%r12
+
+	add	16(%rsp),%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,16(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	32(%rsp),%r13
+	mov	8(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	96(%rsp),%r12
+
+	add	24(%rsp),%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,24(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	40(%rsp),%r13
+	mov	16(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	104(%rsp),%r12
+
+	add	32(%rsp),%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,32(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	48(%rsp),%r13
+	mov	24(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	112(%rsp),%r12
+
+	add	40(%rsp),%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,40(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	56(%rsp),%r13
+	mov	32(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	120(%rsp),%r12
+
+	add	48(%rsp),%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,48(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	64(%rsp),%r13
+	mov	40(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	0(%rsp),%r12
+
+	add	56(%rsp),%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,56(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	mov	72(%rsp),%r13
+	mov	48(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	8(%rsp),%r12
+
+	add	64(%rsp),%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,64(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	80(%rsp),%r13
+	mov	56(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	16(%rsp),%r12
+
+	add	72(%rsp),%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,72(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	88(%rsp),%r13
+	mov	64(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	24(%rsp),%r12
+
+	add	80(%rsp),%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,80(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	96(%rsp),%r13
+	mov	72(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	32(%rsp),%r12
+
+	add	88(%rsp),%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,88(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	104(%rsp),%r13
+	mov	80(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	40(%rsp),%r12
+
+	add	96(%rsp),%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,96(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	112(%rsp),%r13
+	mov	88(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	48(%rsp),%r12
+
+	add	104(%rsp),%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,104(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	120(%rsp),%r13
+	mov	96(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	56(%rsp),%r12
+
+	add	112(%rsp),%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,112(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	0(%rsp),%r13
+	mov	104(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	64(%rsp),%r12
+
+	add	120(%rsp),%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,120(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	cmp	$80,%rdi
+	jb	.Lrounds_16_xx
+
+	mov	16*8+0*8(%rsp),%rdi
+	lea	16*8(%rsi),%rsi
+
+	add	8*0(%rdi),%rax
+	add	8*1(%rdi),%rbx
+	add	8*2(%rdi),%rcx
+	add	8*3(%rdi),%rdx
+	add	8*4(%rdi),%r8
+	add	8*5(%rdi),%r9
+	add	8*6(%rdi),%r10
+	add	8*7(%rdi),%r11
+
+	cmp	16*8+2*8(%rsp),%rsi
+
+	mov	%rax,8*0(%rdi)
+	mov	%rbx,8*1(%rdi)
+	mov	%rcx,8*2(%rdi)
+	mov	%rdx,8*3(%rdi)
+	mov	%r8,8*4(%rdi)
+	mov	%r9,8*5(%rdi)
+	mov	%r10,8*6(%rdi)
+	mov	%r11,8*7(%rdi)
+	jb	.Lloop
+
+	mov	16*8+3*8(%rsp),%rsp
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	ret
+SET_SIZE(SHA512TransformBlocks)
+
+.align	64
+.type	K512,@object
+K512:	# 80 64-bit SHA-512 round constants (K[round]), two per line
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif /* !lint && !__lint */
diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c
index 7dd5dbf42..aa63e431f 100644
--- a/module/icp/illumos-crypto.c
+++ b/module/icp/illumos-crypto.c
@@ -109,8 +109,10 @@
 void __exit
 icp_fini(void)
 {
+	skein_mod_fini();
 	sha2_mod_fini();
 	sha1_mod_fini();
+	edonr_mod_fini();
 	aes_mod_fini();
 	kcf_sched_destroy();
 	kcf_prov_tab_destroy();
@@ -139,8 +141,10 @@ icp_init(void)
 
 	/* initialize algorithms */
 	aes_mod_init();
+	edonr_mod_init();
 	sha1_mod_init();
 	sha2_mod_init();
+	skein_mod_init();
 
 	return (0);
 }
diff --git a/module/icp/include/sha2/sha2.h b/module/icp/include/sha2/sha2.h
deleted file mode 100644
index 8e53987a7..000000000
--- a/module/icp/include/sha2/sha2.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * CDDL HEADER START
- *
- * The contents of this file are subject to the terms of the
- * Common Development and Distribution License (the "License").
- * You may not use this file except in compliance with the License.
- *
- * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
- * See the License for the specific language governing permissions
- * and limitations under the License.
- *
- * When distributing Covered Code, include this CDDL HEADER in each
- * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
- * If applicable, add the following below this CDDL HEADER, with the
- * fields enclosed by brackets "[]" replaced with your own identifying
- * information: Portions Copyright [yyyy] [name of copyright owner]
- *
- * CDDL HEADER END
- */
-/*
- * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
- * Use is subject to license terms.
- */
-/* Copyright 2013 Saso Kiselkov.  All rights reserved. */
-
-#ifndef _SYS_SHA2_H
-#define	_SYS_SHA2_H
-
-#include <sys/types.h>		/* for uint_* */
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-#define	SHA2_HMAC_MIN_KEY_LEN	1	/* SHA2-HMAC min key length in bytes */
-#define	SHA2_HMAC_MAX_KEY_LEN	INT_MAX	/* SHA2-HMAC max key length in bytes */
-
-#define	SHA256_DIGEST_LENGTH	32	/* SHA256 digest length in bytes */
-
-#define	SHA256_HMAC_BLOCK_SIZE	64	/* SHA256-HMAC block size */
-
-#define	SHA256			0
-#define	SHA256_HMAC		1
-#define	SHA256_HMAC_GEN		2
-
-/*
- * SHA2 context.
- * The contents of this structure are a private interface between the
- * Init/Update/Final calls of the functions defined below.
- * Callers must never attempt to read or write any of the fields
- * in this structure directly.
- */
-typedef struct 	{
-	uint32_t algotype;		/* Algorithm Type */
-
-	/* state (ABCDEFGH) */
-	union {
-		uint32_t s32[8];	/* for SHA256 */
-		uint64_t s64[8];	/* for SHA384/512 */
-	} state;
-	/* number of bits */
-	union {
-		uint32_t c32[2];	/* for SHA256 , modulo 2^64 */
-		uint64_t c64[2];	/* for SHA384/512, modulo 2^128 */
-	} count;
-	union {
-		uint8_t		buf8[128];	/* undigested input */
-		uint32_t	buf32[32];	/* realigned input */
-		uint64_t	buf64[16];	/* realigned input */
-	} buf_un;
-} SHA2_CTX;
-
-typedef SHA2_CTX SHA256_CTX;
-typedef SHA2_CTX SHA384_CTX;
-typedef SHA2_CTX SHA512_CTX;
-
-extern void SHA2Init(uint64_t mech, SHA2_CTX *);
-
-extern void SHA2Update(SHA2_CTX *, const void *, size_t);
-
-extern void SHA2Final(void *, SHA2_CTX *);
-
-extern void SHA256Init(SHA256_CTX *);
-
-extern void SHA256Update(SHA256_CTX *, const void *, size_t);
-
-extern void SHA256Final(void *, SHA256_CTX *);
-
-#ifdef _SHA2_IMPL
-/*
- * The following types/functions are all private to the implementation
- * of the SHA2 functions and must not be used by consumers of the interface
- */
-
-/*
- * List of support mechanisms in this module.
- *
- * It is important to note that in the module, division or modulus calculations
- * are used on the enumerated type to determine which mechanism is being used;
- * therefore, changing the order or additional mechanisms should be done
- * carefully
- */
-typedef enum sha2_mech_type {
-	SHA256_MECH_INFO_TYPE,		/* SUN_CKM_SHA256 */
-	SHA256_HMAC_MECH_INFO_TYPE,	/* SUN_CKM_SHA256_HMAC */
-	SHA256_HMAC_GEN_MECH_INFO_TYPE,	/* SUN_CKM_SHA256_HMAC_GENERAL */
-} sha2_mech_type_t;
-
-#endif /* _SHA2_IMPL */
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif /* _SYS_SHA2_H */
diff --git a/module/icp/include/sha2/sha2_impl.h b/module/icp/include/sha2/sha2_impl.h
index bb42c3cd4..b9768d344 100644
--- a/module/icp/include/sha2/sha2_impl.h
+++ b/module/icp/include/sha2/sha2_impl.h
@@ -26,6 +26,8 @@
 #ifndef	_SHA2_IMPL_H
 #define	_SHA2_IMPL_H
 
+#include <sys/sha2.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/module/icp/io/edonr_mod.c b/module/icp/io/edonr_mod.c
new file mode 100644
index 000000000..19b5c963d
--- /dev/null
+++ b/module/icp/io/edonr_mod.c
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/edonr.h>
+
+/*
+ * Unlike sha2 or skein, we won't expose edonr via the Kernel Cryptographic
+ * Framework (KCF), because Edon-R is *NOT* suitable for general-purpose
+ * cryptographic use. Users of Edon-R must interface directly to this module.
+ */
+
+static struct modlmisc modlmisc = {
+	&mod_cryptoops,
+	"Edon-R Message-Digest Algorithm"
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, {&modlmisc, NULL}	/* modlmisc only; list ends at NULL */
+};
+
+int
+edonr_mod_init(void)
+{
+	/*
+	 * Installing the module linkage is all the setup there is, so
+	 * the status of mod_install() (0 on success) is handed straight
+	 * back to the caller.
+	 */
+	return (mod_install(&modlinkage));
+}
+
+int
+edonr_mod_fini(void) {
+	return (mod_remove(&modlinkage));	/* status passed through */
+}
diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c
index be0f7a42c..3913d7618 100644
--- a/module/icp/io/sha2_mod.c
+++ b/module/icp/io/sha2_mod.c
@@ -30,7 +30,7 @@
 #include <sys/crypto/spi.h>
 #include <sys/crypto/icp.h>
 #define	_SHA2_IMPL
-#include <sha2/sha2.h>
+#include <sys/sha2.h>
 #include <sha2/sha2_impl.h>
 
 /*
diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c
new file mode 100644
index 000000000..e909a7e31
--- /dev/null
+++ b/module/icp/io/skein_mod.c
@@ -0,0 +1,721 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#define	SKEIN_MODULE_IMPL
+#include <sys/skein.h>
+
+/*
+ * Like the sha2 module, we create the skein module with two modlinkages:
+ * - modlmisc to allow direct calls to Skein_* API functions.
+ * - modlcrypto to integrate well into the Kernel Crypto Framework (KCF).
+ */
+static struct modlmisc modlmisc = {	/* linkage for direct Skein_* calls */
+	&mod_cryptoops,
+	"Skein Message-Digest Algorithm"
+};
+
+static struct modlcrypto modlcrypto = {	/* linkage for the KCF provider */
+	&mod_cryptoops,
+	"Skein Kernel SW Provider"
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1, {&modlmisc, &modlcrypto, NULL}	/* list ends at NULL */
+};
+
+static crypto_mech_info_t skein_mech_info_tab[] = { /* digest + MAC per size */
+	{CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE,
+	    CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+	    0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+	{CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE,
+	    CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+	    CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+	{CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE,
+	    CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+	    0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+	{CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE,
+	    CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+	    CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+	{CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE,
+	    CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+	    0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+	{CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE,
+	    CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+	    CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void skein_provider_status(crypto_provider_handle_t, uint_t *);
+
+static crypto_control_ops_t skein_control_ops = {
+	skein_provider_status	/* always reports CRYPTO_PROVIDER_READY */
+};
+
+static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+    crypto_req_handle_t);
+static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+    crypto_req_handle_t);
+static int skein_update(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+    crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+    crypto_req_handle_t);
+
+static crypto_digest_ops_t skein_digest_ops = {
+	skein_digest_init,
+	skein_digest,
+	skein_update,
+	NULL,	/* NOTE(review): slot unused; presumably digest_key - confirm */
+	skein_final,
+	skein_digest_atomic
+};
+
+static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+    crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int skein_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+    crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+    crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+static crypto_mac_ops_t skein_mac_ops = {
+	skein_mac_init,
+	NULL,	/* NOTE(review): presumably the one-shot mac slot - confirm */
+	skein_update,	/* using regular digest update is OK here */
+	skein_final,	/* using regular digest final is OK here */
+	skein_mac_atomic,
+	NULL	/* NOTE(review): presumably mac_verify_atomic - confirm */
+};
+
+static int skein_create_ctx_template(crypto_provider_handle_t,
+    crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+    size_t *, crypto_req_handle_t);
+static int skein_free_context(crypto_ctx_t *);
+
+static crypto_ctx_ops_t skein_ctx_ops = {	/* template + free hooks */
+	skein_create_ctx_template,
+	skein_free_context
+};
+
+static crypto_ops_t skein_crypto_ops = {{{{{	/* unset op groups are NULL */
+	&skein_control_ops,
+	&skein_digest_ops,
+	NULL,
+	&skein_mac_ops,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	&skein_ctx_ops,
+}}}}};
+
+static crypto_provider_info_t skein_prov_info = {{{{
+	CRYPTO_SPI_VERSION_1,
+	"Skein Software Provider",
+	CRYPTO_SW_PROVIDER,
+	NULL,
+	&skein_crypto_ops,
+	sizeof (skein_mech_info_tab) / sizeof (crypto_mech_info_t),
+	skein_mech_info_tab	/* entry count is computed on the line above */
+}}}};
+
+static crypto_kcf_provider_handle_t skein_prov_handle = 0; /* filled by KCF */
+
+typedef struct skein_ctx {
+	skein_mech_type_t		sc_mech_type;	/* picks union arm */
+	size_t				sc_digest_bitlen;	/* in bits */
+	/*LINTED(E_ANONYMOUS_UNION_DECL)*/
+	union {
+		Skein_256_Ctxt_t	sc_256;
+		Skein_512_Ctxt_t	sc_512;
+		Skein1024_Ctxt_t	sc_1024;
+	};
+} skein_ctx_t;
+#define	SKEIN_CTX(_ctx_)	((skein_ctx_t *)((_ctx_)->cc_provider_private))
+#define	SKEIN_CTX_LVALUE(_ctx_)	(_ctx_)->cc_provider_private
+#define	SKEIN_OP(_skein_ctx, _op, ...)					\
+	do {	/* dispatch on sc_mech_type; no default case */		\
+		skein_ctx_t	*sc = (_skein_ctx);			\
+		switch (sc->sc_mech_type) {				\
+		case SKEIN_256_MECH_INFO_TYPE:				\
+		case SKEIN_256_MAC_MECH_INFO_TYPE:			\
+			(void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\
+			break;						\
+		case SKEIN_512_MECH_INFO_TYPE:				\
+		case SKEIN_512_MAC_MECH_INFO_TYPE:			\
+			(void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\
+			break;						\
+		case SKEIN1024_MECH_INFO_TYPE:				\
+		case SKEIN1024_MAC_MECH_INFO_TYPE:			\
+			(void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\
+			break;						\
+		}							\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+static int
+skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result)
+{
+	/*LINTED(E_BAD_PTR_CAST_ALIGN)*/
+	skein_param_t	*param = (skein_param_t *)mechanism->cm_param;
+
+	/* An explicit parameter must be exactly sized and non-zero. */
+	if (param != NULL) {
+		if (mechanism->cm_param_len != sizeof (*param) ||
+		    param->sp_digest_bitlen == 0)
+			return (CRYPTO_MECHANISM_PARAM_INVALID);
+		*result = param->sp_digest_bitlen;
+		return (CRYPTO_SUCCESS);
+	}
+	/* No parameter: only the plain digest types carry a default. */
+	switch (mechanism->cm_type) {
+	case SKEIN_256_MECH_INFO_TYPE:
+		*result = 256;
+		return (CRYPTO_SUCCESS);
+	case SKEIN_512_MECH_INFO_TYPE:
+		*result = 512;
+		return (CRYPTO_SUCCESS);
+	case SKEIN1024_MECH_INFO_TYPE:
+		*result = 1024;
+		return (CRYPTO_SUCCESS);
+	default:
+		return (CRYPTO_MECHANISM_INVALID);
+	}
+}
+
+int
+skein_mod_init(void)
+{
+	int error = mod_install(&modlinkage);
+
+	if (error == 0)		/* best effort: Skein_* API works without KCF */
+		(void) crypto_register_provider(&skein_prov_info,
+		    &skein_prov_handle);
+	return (error);
+}
+
+int
+skein_mod_fini(void)
+{
+	/* Undo the KCF registration from skein_mod_init(), if it succeeded. */
+	if (skein_prov_handle != 0 &&
+	    crypto_unregister_provider(skein_prov_handle) != CRYPTO_SUCCESS)
+		return (EBUSY);
+	skein_prov_handle = 0;
+	return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+/* ARGSUSED */
+static void
+skein_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+	*status = CRYPTO_PROVIDER_READY;	/* always ready */
+}
+
+/*
+ * General Skein hashing helper functions.
+ */
+
+/*
+ * Feeds uio-format input into the hash held by `ctx': cd_length bytes,
+ * starting cd_offset bytes into the iovec array of data->cd_uio.
+ * Returns CRYPTO_ARGUMENTS_BAD unless the uio is UIO_SYSSPACE,
+ * CRYPTO_DATA_LEN_RANGE when the offset or length overruns the iovecs,
+ * and CRYPTO_SUCCESS otherwise.
+ */
+static int
+skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data)
+{
+	const uio_t	*uio = data->cd_uio;
+	off_t		offset = data->cd_offset;
+	size_t		length = data->cd_length;
+	size_t		cur_len;
+	uint_t		vec_idx = 0;
+
+	/* only kernel-space buffers are accepted */
+	if (uio->uio_segflg != UIO_SYSSPACE)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	/*
+	 * Skip whole iovecs until `offset' lands inside one; what is
+	 * left in `offset' is the starting position within that iovec.
+	 */
+	while (vec_idx < uio->uio_iovcnt &&
+	    offset >= uio->uio_iov[vec_idx].iov_len)
+		offset -= uio->uio_iov[vec_idx++].iov_len;
+
+	/*
+	 * Running out of iovecs means the offset lies beyond the
+	 * total size of the buffers the caller provided.
+	 */
+	if (vec_idx == uio->uio_iovcnt)
+		return (CRYPTO_DATA_LEN_RANGE);
+
+	/*
+	 * Hash iovec by iovec; only the first can start mid-buffer.
+	 */
+	for (; vec_idx < uio->uio_iovcnt && length > 0; vec_idx++) {
+		cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset, length);
+		SKEIN_OP(ctx, Update, (uint8_t *)uio->uio_iov[vec_idx].iov_base
+		    + offset, cur_len);
+		length -= cur_len;
+		offset = 0;
+	}
+
+	/*
+	 * Bytes still outstanding after the last iovec: the caller asked
+	 * to digest more data than it provided.
+	 */
+	if (vec_idx == uio->uio_iovcnt && length > 0)
+		return (CRYPTO_DATA_LEN_RANGE);
+
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Finalizes the hash in `ctx' and stores the digest in the uio buffers of
+ * `digest', starting cd_offset bytes in. A digest that does not fit in one
+ * iovec is staged in a temporary buffer. Returns a CRYPTO_* status.
+ */
+static int
+skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest,
+    crypto_req_handle_t req)
+{
+	off_t	offset = digest->cd_offset;
+	uint_t	vec_idx;
+	uio_t	*uio = digest->cd_uio;
+
+	/* we support only kernel buffer */
+	if (uio->uio_segflg != UIO_SYSSPACE)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	/* Find the iovec in which the digest starts; bound-check first. */
+	for (vec_idx = 0; vec_idx < uio->uio_iovcnt &&
+	    offset >= uio->uio_iov[vec_idx].iov_len;
+	    offset -= uio->uio_iov[vec_idx++].iov_len)
+		;
+	if (vec_idx == uio->uio_iovcnt) {
+		/*
+		 * The caller specified an offset that is larger than the
+		 * total size of the buffers it provided.
+		 */
+		return (CRYPTO_DATA_LEN_RANGE);
+	}
+	if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <=
+	    uio->uio_iov[vec_idx].iov_len) {
+		/* The computed digest will fit in the current iovec. */
+		SKEIN_OP(ctx, Final,
+		    (uchar_t *)uio->uio_iov[vec_idx].iov_base + offset);
+	} else {
+		uint8_t *digest_tmp;
+		off_t scratch_offset = 0;
+		size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen);
+		size_t cur_len;
+
+		digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES(
+		    ctx->sc_digest_bitlen), crypto_kmflag(req));
+		if (digest_tmp == NULL)
+			return (CRYPTO_HOST_MEMORY);
+		SKEIN_OP(ctx, Final, digest_tmp);
+		while (vec_idx < uio->uio_iovcnt && length > 0) {
+			cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset,
+			    length);
+			bcopy(digest_tmp + scratch_offset,
+			    uio->uio_iov[vec_idx].iov_base + offset, cur_len);
+
+			length -= cur_len;
+			vec_idx++;
+			scratch_offset += cur_len;
+			offset = 0;
+		}
+		kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen));
+
+		if (vec_idx == uio->uio_iovcnt && length > 0) {
+			/*
+			 * The iovecs ran out before the whole digest
+			 * was copied, i.e. the caller supplied less
+			 * output space than the digest needs. Note the
+			 * context has already been finalized by now.
+			 */
+			return (CRYPTO_DATA_LEN_RANGE);
+		}
+	}
+
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+/*
+ * Allocates the digest context for `ctx' and initializes it as `mechanism'
+ * directs. cm_type must satisfy VALID_SKEIN_DIGEST_MECH(). The cm_param
+ * field may contain a skein_param_t structure indicating the length of the
+ * digest in bits; otherwise the defaults of skein_get_digest_bitlen() apply
+ * (256, 512 and 1024 bits for Skein-256, Skein-512 and Skein-1024). If the
+ * length cannot be determined the context is wiped, freed and detached.
+ */
+static int
+skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_req_handle_t req)
+{
+	skein_ctx_t	*sctx;
+	int		error;
+
+	if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+		return (CRYPTO_MECHANISM_INVALID);
+
+	sctx = kmem_alloc(sizeof (*sctx), crypto_kmflag(req));
+	SKEIN_CTX_LVALUE(ctx) = sctx;
+	if (sctx == NULL)
+		return (CRYPTO_HOST_MEMORY);
+
+	sctx->sc_mech_type = mechanism->cm_type;
+	error = skein_get_digest_bitlen(mechanism, &sctx->sc_digest_bitlen);
+	if (error == CRYPTO_SUCCESS) {
+		SKEIN_OP(sctx, Init, sctx->sc_digest_bitlen);
+		return (CRYPTO_SUCCESS);
+	}
+
+	/* failure: scrub and release the half-built context */
+	bzero(sctx, sizeof (*sctx));
+	kmem_free(sctx, sizeof (*sctx));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+	return (error);
+}
+
+/*
+ * One-call digest on an initialized context: absorbs `data' via
+ * skein_update() and emits the result into `digest' via skein_final().
+ * Arguments follow the conventions of those two functions.
+ */
+static int
+skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+    crypto_req_handle_t req)
+{
+	size_t	digest_bytes;
+	int	error;
+
+	ASSERT(SKEIN_CTX(ctx) != NULL);
+
+	/* refuse early, reporting the size needed, if the output is short */
+	digest_bytes = CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+	if (digest->cd_length < digest_bytes) {
+		digest->cd_length = digest_bytes;
+		return (CRYPTO_BUFFER_TOO_SMALL);
+	}
+
+	error = skein_update(ctx, data, req);
+	if (error == CRYPTO_SUCCESS)
+		return (skein_final(ctx, digest, req));
+
+	/* update failed: wipe and release the context, report no output */
+	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+	digest->cd_length = 0;
+	return (error);
+}
+
+/*
+ * Performs a skein Update with the input message in `data' (successive calls
+ * can push more data). This is used both for digest and MAC operation.
+ * Input in raw or uio format is handled; any other format is rejected.
+ */
+/*ARGSUSED*/
+static int
+skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req)
+{
+	ASSERT(SKEIN_CTX(ctx) != NULL);
+
+	/* scattered input is walked iovec by iovec by the uio helper */
+	if (data->cd_format == CRYPTO_DATA_UIO)
+		return (skein_digest_update_uio(SKEIN_CTX(ctx), data));
+
+	/* anything that is neither uio nor raw is rejected */
+	if (data->cd_format != CRYPTO_DATA_RAW)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	/*
+	 * Raw input is one contiguous buffer: absorb cd_length bytes
+	 * starting cd_offset bytes into it.
+	 */
+	SKEIN_OP(SKEIN_CTX(ctx), Update,
+	    (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+	    data->cd_length);
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Performs a skein Final, writing the output to `digest' (raw or uio format
+ * only). Shared by digest and MAC operation. Unless the output buffer is
+ * too small, the context is wiped and kmem_free()d on return.
+ */
+/*ARGSUSED*/
+static int
+skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req)
+{
+	size_t	digest_bytes;
+	int	error = CRYPTO_SUCCESS;
+
+	ASSERT(SKEIN_CTX(ctx) != NULL);
+
+	/*
+	 * A short output buffer is reported together with the size
+	 * required; the context is left intact in that case.
+	 */
+	digest_bytes = CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+	if (digest->cd_length < digest_bytes) {
+		digest->cd_length = digest_bytes;
+		return (CRYPTO_BUFFER_TOO_SMALL);
+	}
+
+	if (digest->cd_format == CRYPTO_DATA_RAW) {
+		SKEIN_OP(SKEIN_CTX(ctx), Final,
+		    (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset);
+	} else if (digest->cd_format == CRYPTO_DATA_UIO) {
+		error = skein_digest_final_uio(SKEIN_CTX(ctx), digest, req);
+	} else {
+		error = CRYPTO_ARGUMENTS_BAD;
+	}
+
+	/* report the bytes produced, or none at all on failure */
+	digest->cd_length = (error == CRYPTO_SUCCESS) ? digest_bytes : 0;
+
+	/* the operation is over either way, so the context is wiped, */
+	/* released with kmem_free() and detached from `ctx' */
+	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx))));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+
+	return (error);
+}
+
+/*
+ * One-shot Skein digest: sets up a context per `mechanism', hashes `data'
+ * and writes the result to `digest' (raw or uio). The context lives on this
+ * stack frame, so it is finalized in place and never handed to kmem_free().
+ */
+/*ARGSUSED*/
+static int
+skein_digest_atomic(crypto_provider_handle_t provider,
+    crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+    crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req)
+{
+	int		error;
+	size_t		len = 0;
+	skein_ctx_t	skein_ctx;
+	crypto_ctx_t	ctx;
+	SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+	if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+		return (CRYPTO_MECHANISM_INVALID);
+	skein_ctx.sc_mech_type = mechanism->cm_type;
+	error = skein_get_digest_bitlen(mechanism, &skein_ctx.sc_digest_bitlen);
+	if (error != CRYPTO_SUCCESS)
+		goto out;
+	len = CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen);
+	SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen);
+	if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+		goto out;
+	if (digest->cd_length < len)
+		error = CRYPTO_BUFFER_TOO_SMALL;
+	else if (digest->cd_format == CRYPTO_DATA_RAW)
+		SKEIN_OP(&skein_ctx, Final,
+		    (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset);
+	else if (digest->cd_format == CRYPTO_DATA_UIO)
+		error = skein_digest_final_uio(&skein_ctx, digest, req);
+	else
+		error = CRYPTO_ARGUMENTS_BAD;
+out:
+	digest->cd_length = (error == CRYPTO_SUCCESS) ? len : 0;
+	bzero(&skein_ctx, sizeof (skein_ctx));
+	return (error);
+}
+
+/*
+ * Fills in `ctx' as a keyed (MAC) Skein context: validates the MAC
+ * mechanism, requires a raw-format key and runs InitExt with that key.
+ */
+static int
+skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_key_t *key)
+{
+	int rc;
+
+	if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type))
+		return (CRYPTO_MECHANISM_INVALID);
+	if (key->ck_format != CRYPTO_KEY_RAW)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	ctx->sc_mech_type = mechanism->cm_type;
+	rc = skein_get_digest_bitlen(mechanism, &ctx->sc_digest_bitlen);
+	if (rc == CRYPTO_SUCCESS) {
+		SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0, key->ck_data,
+		    CRYPTO_BITS2BYTES(key->ck_length));
+	}
+	return (rc);
+}
+
+/*
+ * KCF software provider MAC entry points.
+ */
+/*
+ * Initializes a skein MAC context. A non-NULL ctx_template is copied in as
+ * the ready-made context; otherwise one is built from `mechanism' and `key'
+ * (cm_type must be a SKEIN_*_MAC_MECH_INFO_TYPE, the key in raw format, and
+ * cm_param may carry a skein_param_t selecting the digest length). If that
+ * fails the context is wiped, freed and detached from `ctx'.
+ */
+static int
+skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+    crypto_req_handle_t req)
+{
+	int	error;
+
+	SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
+	    crypto_kmflag(req));
+	if (SKEIN_CTX(ctx) == NULL)
+		return (CRYPTO_HOST_MEMORY);
+
+	if (ctx_template != NULL) {
+		bcopy(ctx_template, SKEIN_CTX(ctx),
+		    sizeof (*SKEIN_CTX(ctx)));
+	} else {
+		error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key);
+		if (error != CRYPTO_SUCCESS)
+			goto errout;
+	}
+
+	return (CRYPTO_SUCCESS);
+errout:
+	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+	return (error);
+}
+
+/*
+ * The MAC update and final calls are reused from the regular digest code.
+ */
+
+/*
+ * Same as skein_digest_atomic, performs an atomic Skein MAC operation in
+ * one step. All the same properties apply to the arguments of this
+ * function as to those of the partial operations above.
+ *
+ * If ctx_template is non-NULL it is copied and used as the starting state;
+ * otherwise the state is built from `mechanism' and `key'. The MAC of `data'
+ * is written to `mac'. Returns CRYPTO_SUCCESS or the first error reported
+ * by skein_mac_ctx_build(), skein_update() or skein_final().
+ *
+ * The on-stack Skein state is derived from the key, so it is scrubbed
+ * before returning on success as well as on failure.
+ *
+ * NOTE(review): skein_update()/skein_final() are handed a context that
+ * lives on this function's stack; confirm that neither of them tries to
+ * free the context it is given.
+ */
+/*ARGSUSED*/
+static int
+skein_mac_atomic(crypto_provider_handle_t provider,
+    crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+    crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+    crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+	int		error;
+	skein_ctx_t	skein_ctx;
+	/* faux crypto context just for skein_{update,final} */
+	crypto_ctx_t	ctx;
+	SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+	if (ctx_template != NULL) {
+		bcopy(ctx_template, &skein_ctx, sizeof (skein_ctx));
+	} else {
+		error = skein_mac_ctx_build(&skein_ctx, mechanism, key);
+		if (error != CRYPTO_SUCCESS)
+			goto out;
+	}
+
+	if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+		goto out;
+	error = skein_final(&ctx, mac, req);
+
+out:
+	/* Always wipe the local state, whatever the outcome. */
+	bzero(&skein_ctx, sizeof (skein_ctx));
+	return (error);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/*
+ * Builds a reusable Skein MAC context template from `mechanism' and `key'.
+ * The arguments follow the same rules as those of skein_mac_init.
+ *
+ * On success the newly allocated template and its size are handed back
+ * through `ctx_template' and `ctx_template_size' and CRYPTO_SUCCESS is
+ * returned. Returns CRYPTO_HOST_MEMORY if the allocation fails, or the
+ * error from skein_mac_ctx_build(), in which case the allocation is
+ * scrubbed and freed and the output arguments are left untouched.
+ */
+/*ARGSUSED*/
+static int
+skein_create_ctx_template(crypto_provider_handle_t provider,
+    crypto_mechanism_t *mechanism, crypto_key_t *key,
+    crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+    crypto_req_handle_t req)
+{
+	skein_ctx_t	*tmpl;
+	int		rv;
+
+	tmpl = kmem_alloc(sizeof (*tmpl), crypto_kmflag(req));
+	if (tmpl == NULL)
+		return (CRYPTO_HOST_MEMORY);
+
+	rv = skein_mac_ctx_build(tmpl, mechanism, key);
+	if (rv != CRYPTO_SUCCESS) {
+		/* Build failed: wipe and release the half-made template. */
+		bzero(tmpl, sizeof (*tmpl));
+		kmem_free(tmpl, sizeof (*tmpl));
+		return (rv);
+	}
+
+	*ctx_template = tmpl;
+	*ctx_template_size = sizeof (*tmpl);
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Releases the Skein context attached to the crypto context `ctx', if
+ * there is one: the context is zeroed, freed, and the pointer held in
+ * `ctx' is cleared. Always returns CRYPTO_SUCCESS.
+ */
+static int
+skein_free_context(crypto_ctx_t *ctx)
+{
+	/* Nothing attached, nothing to do. */
+	if (SKEIN_CTX(ctx) == NULL)
+		return (CRYPTO_SUCCESS);
+
+	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+
+	return (CRYPTO_SUCCESS);
+}
author	Tony Hutter <[email protected]>	2016-06-15 15:47:05 -0700
committer	Tony Hutter <[email protected]>	2016-10-03 14:51:15 -0700
commit	3c67d83a8afb391f20bc53d36a0cebea6897b3e2 (patch)
tree	2b862986c83414c7359c00219b43ad47dd73f81e /module/icp
parent	62a65a654e15a1388bfb571727e69b46e7cc07ab (diff)