78 files changed, 8992 insertions, 199 deletions
diff --git a/Makefile.am b/Makefile.am
index fe4285b3f..abc98e4ed 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -6,7 +6,7 @@ include config/tgz.am
 
 SUBDIRS = include rpm
 if CONFIG_USER
-SUBDIRS += udev etc man scripts tests lib cmd contrib
+SUBDIRS += udev etc man scripts lib tests cmd contrib
 endif
 if CONFIG_KERNEL
 SUBDIRS += module
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 1b77b6cee..912a7f70e 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -5654,16 +5654,16 @@ ztest_fletcher(ztest_ds_t *zd, uint64_t id)
 			*ptr = ztest_random(UINT_MAX);
 
 		VERIFY0(fletcher_4_impl_set("scalar"));
-		fletcher_4_native(buf, size, &zc_ref);
-		fletcher_4_byteswap(buf, size, &zc_ref_byteswap);
+		fletcher_4_native(buf, size, NULL, &zc_ref);
+		fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap);
 
 		VERIFY0(fletcher_4_impl_set("cycle"));
 		while (run_count-- > 0) {
 			zio_cksum_t zc;
 			zio_cksum_t zc_byteswap;
 
-			fletcher_4_byteswap(buf, size, &zc_byteswap);
-			fletcher_4_native(buf, size, &zc);
+			fletcher_4_byteswap(buf, size, NULL, &zc_byteswap);
+			fletcher_4_native(buf, size, NULL, &zc);
 
 			VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc)));
 			VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap,
diff --git a/configure.ac b/configure.ac
index c7685550d..edcf29958 100644
--- a/configure.ac
+++ b/configure.ac
@@ -49,6 +49,7 @@ AC_PROG_INSTALL
 AC_PROG_CC
 AC_PROG_LIBTOOL
 AM_PROG_AS
+AM_PROG_CC_C_O
 
 ZFS_AC_LICENSE
 ZFS_AC_PACKAGE
@@ -178,6 +179,7 @@ AC_CONFIG_FILES([
 	tests/zfs-tests/tests/functional/cache/Makefile
 	tests/zfs-tests/tests/functional/cachefile/Makefile
 	tests/zfs-tests/tests/functional/casenorm/Makefile
+	tests/zfs-tests/tests/functional/checksum/Makefile
 	tests/zfs-tests/tests/functional/clean_mirror/Makefile
 	tests/zfs-tests/tests/functional/cli_root/Makefile
 	tests/zfs-tests/tests/functional/cli_root/zdb/Makefile
diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am
index 40cd0597c..96d77c7b3 100644
--- a/include/sys/Makefile.am
+++ b/include/sys/Makefile.am
@@ -31,6 +31,7 @@ COMMON_H = \
 	$(top_srcdir)/include/sys/dsl_scan.h \
 	$(top_srcdir)/include/sys/dsl_synctask.h \
 	$(top_srcdir)/include/sys/dsl_userhold.h \
+	$(top_srcdir)/include/sys/edonr.h \
 	$(top_srcdir)/include/sys/efi_partition.h \
 	$(top_srcdir)/include/sys/metaslab.h \
 	$(top_srcdir)/include/sys/metaslab_impl.h \
@@ -46,6 +47,8 @@ COMMON_H = \
 	$(top_srcdir)/include/sys/sa.h \
 	$(top_srcdir)/include/sys/sa_impl.h \
 	$(top_srcdir)/include/sys/sdt.h \
+	$(top_srcdir)/include/sys/sha2.h \
+	$(top_srcdir)/include/sys/skein.h \
 	$(top_srcdir)/include/sys/spa_boot.h \
 	$(top_srcdir)/include/sys/space_map.h \
 	$(top_srcdir)/include/sys/space_reftree.h \
diff --git a/include/sys/crypto/icp.h b/include/sys/crypto/icp.h
index c7bb78e83..d8948e022 100644
--- a/include/sys/crypto/icp.h
+++ b/include/sys/crypto/icp.h
@@ -29,12 +29,18 @@
 int aes_mod_init(void);
 int aes_mod_fini(void);
 
+int edonr_mod_init(void);
+int edonr_mod_fini(void);
+
 int sha1_mod_init(void);
 int sha1_mod_fini(void);
 
 int sha2_mod_init(void);
 int sha2_mod_fini(void);
 
+int skein_mod_init(void);
+int skein_mod_fini(void);
+
 int icp_init(void);
 void icp_fini(void);
 
diff --git a/include/sys/dmu.h b/include/sys/dmu.h
index 4efab7c72..b67acb52c 100644
--- a/include/sys/dmu.h
+++ b/include/sys/dmu.h
@@ -25,6 +25,7 @@
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -317,6 +318,7 @@ typedef struct dmu_buf {
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
+#define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
 #define	DMU_POOL_VDEV_ZAP_MAP		"com.delphix:vdev_zap_map"
 
 /*
diff --git a/include/sys/edonr.h b/include/sys/edonr.h
new file mode 100644
index 000000000..79b7cd8c7
--- /dev/null
+++ b/include/sys/edonr.h
@@ -0,0 +1,98 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <[email protected]>
+ *
+ * Tweaked Edon-R implementation for SUPERCOP, based on NIST API.
+ *
+ * $Id: edonr.h 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#ifndef	_SYS_EDONR_H_
+#define	_SYS_EDONR_H_
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef  _KERNEL
+#include <sys/types.h>
+#else
+#include <stdint.h> /* uint32_t... */
+#include <stdlib.h> /* size_t ... */
+#endif
+
+/*
+ * EdonR allows EdonRUpdate() to be called consecutively only if the total
+ * length of the stored unprocessed data plus the newly supplied data is less
+ * than or equal to the BLOCK_SIZE on which the compression function operates.
+ * Otherwise an assertion failure is invoked.
+ */
+
+/* Specific algorithm definitions */
+#define	EdonR224_DIGEST_SIZE	28
+#define	EdonR224_BLOCK_SIZE	64
+#define	EdonR256_DIGEST_SIZE	32
+#define	EdonR256_BLOCK_SIZE	64
+#define	EdonR384_DIGEST_SIZE	48
+#define	EdonR384_BLOCK_SIZE	128
+#define	EdonR512_DIGEST_SIZE	64
+#define	EdonR512_BLOCK_SIZE	128
+
+#define	EdonR256_BLOCK_BITSIZE	512
+#define	EdonR512_BLOCK_BITSIZE	1024
+
+typedef struct {
+	uint32_t DoublePipe[16];
+	uint8_t LastPart[EdonR256_BLOCK_SIZE * 2];
+} EdonRData256;
+typedef struct {
+	uint64_t DoublePipe[16];
+	uint8_t LastPart[EdonR512_BLOCK_SIZE * 2];
+} EdonRData512;
+
+typedef struct {
+	size_t hashbitlen;
+
+	/* + algorithm specific parameters */
+	int unprocessed_bits;
+	uint64_t bits_processed;
+	union {
+		EdonRData256 p256[1];
+		EdonRData512 p512[1];
+	} pipe[1];
+} EdonRState;
+
+void EdonRInit(EdonRState *state, size_t hashbitlen);
+void EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen);
+void EdonRFinal(EdonRState *state, uint8_t *hashval);
+void EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen,
+    uint8_t *hashval);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_EDONR_H_ */
diff --git a/module/icp/include/sha2/sha2.h b/include/sys/sha2.h
index 8e53987a7..9039835f1 100644
--- a/module/icp/include/sha2/sha2.h
+++ b/include/sys/sha2.h
@@ -27,7 +27,11 @@
 #ifndef _SYS_SHA2_H
 #define	_SYS_SHA2_H
 
+#ifdef  _KERNEL
 #include <sys/types.h>		/* for uint_* */
+#else
+#include <stdint.h>
+#endif
 
 #ifdef	__cplusplus
 extern "C" {
@@ -37,12 +41,27 @@ extern "C" {
 #define	SHA2_HMAC_MAX_KEY_LEN	INT_MAX	/* SHA2-HMAC max key length in bytes */
 
 #define	SHA256_DIGEST_LENGTH	32	/* SHA256 digest length in bytes */
+#define	SHA384_DIGEST_LENGTH	48	/* SHA384 digest length in bytes */
+#define	SHA512_DIGEST_LENGTH	64	/* SHA512 digest length in bytes */
+
+/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */
+#define	SHA512_224_DIGEST_LENGTH	28	/* SHA512/224 digest length */
+#define	SHA512_256_DIGEST_LENGTH	32	/* SHA512/256 digest length */
 
 #define	SHA256_HMAC_BLOCK_SIZE	64	/* SHA256-HMAC block size */
+#define	SHA512_HMAC_BLOCK_SIZE	128	/* SHA512-HMAC block size */
 
 #define	SHA256			0
 #define	SHA256_HMAC		1
 #define	SHA256_HMAC_GEN		2
+#define	SHA384			3
+#define	SHA384_HMAC		4
+#define	SHA384_HMAC_GEN		5
+#define	SHA512			6
+#define	SHA512_HMAC		7
+#define	SHA512_HMAC_GEN		8
+#define	SHA512_224		9
+#define	SHA512_256		10
 
 /*
  * SHA2 context.
@@ -87,6 +106,18 @@ extern void SHA256Update(SHA256_CTX *, const void *, size_t);
 
 extern void SHA256Final(void *, SHA256_CTX *);
 
+extern void SHA384Init(SHA384_CTX *);
+
+extern void SHA384Update(SHA384_CTX *, const void *, size_t);
+
+extern void SHA384Final(void *, SHA384_CTX *);
+
+extern void SHA512Init(SHA512_CTX *);
+
+extern void SHA512Update(SHA512_CTX *, const void *, size_t);
+
+extern void SHA512Final(void *, SHA512_CTX *);
+
 #ifdef _SHA2_IMPL
 /*
  * The following types/functions are all private to the implementation
@@ -105,6 +136,14 @@ typedef enum sha2_mech_type {
 	SHA256_MECH_INFO_TYPE,		/* SUN_CKM_SHA256 */
 	SHA256_HMAC_MECH_INFO_TYPE,	/* SUN_CKM_SHA256_HMAC */
 	SHA256_HMAC_GEN_MECH_INFO_TYPE,	/* SUN_CKM_SHA256_HMAC_GENERAL */
+	SHA384_MECH_INFO_TYPE,		/* SUN_CKM_SHA384 */
+	SHA384_HMAC_MECH_INFO_TYPE,	/* SUN_CKM_SHA384_HMAC */
+	SHA384_HMAC_GEN_MECH_INFO_TYPE,	/* SUN_CKM_SHA384_HMAC_GENERAL */
+	SHA512_MECH_INFO_TYPE,		/* SUN_CKM_SHA512 */
+	SHA512_HMAC_MECH_INFO_TYPE,	/* SUN_CKM_SHA512_HMAC */
+	SHA512_HMAC_GEN_MECH_INFO_TYPE,	/* SUN_CKM_SHA512_HMAC_GENERAL */
+	SHA512_224_MECH_INFO_TYPE,	/* SUN_CKM_SHA512_224 */
+	SHA512_256_MECH_INFO_TYPE	/* SUN_CKM_SHA512_256 */
 } sha2_mech_type_t;
 
 #endif /* _SHA2_IMPL */
diff --git a/include/sys/skein.h b/include/sys/skein.h
new file mode 100644
index 000000000..2f649d6b2
--- /dev/null
+++ b/include/sys/skein.h
@@ -0,0 +1,183 @@
+/*
+ * Interface declarations for Skein hashing.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ *
+ * The following compile-time switches may be defined to control some
+ * tradeoffs between speed, code size, error checking, and security.
+ *
+ * The "default" note explains what happens when the switch is not defined.
+ *
+ *  SKEIN_DEBUG            -- make callouts from inside Skein code
+ *                            to examine/display intermediate values.
+ *                            [default: no callouts (no overhead)]
+ *
+ *  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+ *                            code. If not defined, most error checking
+ *                            is disabled (for performance). Otherwise,
+ *                            the switch value is interpreted as:
+ *                                0: use assert()      to flag errors
+ *                                1: return SKEIN_FAIL to flag errors
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+#ifndef	_SYS_SKEIN_H_
+#define	_SYS_SKEIN_H_
+
+#ifdef  _KERNEL
+#include <sys/types.h>		/* get size_t definition */
+#else
+#include <stdint.h>
+#include <stdlib.h>
+#endif
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+enum {
+	SKEIN_SUCCESS = 0,	/* return codes from Skein calls */
+	SKEIN_FAIL = 1,
+	SKEIN_BAD_HASHLEN = 2
+};
+
+#define	SKEIN_MODIFIER_WORDS	(2)	/* number of modifier (tweak) words */
+
+#define	SKEIN_256_STATE_WORDS	(4)
+#define	SKEIN_512_STATE_WORDS	(8)
+#define	SKEIN1024_STATE_WORDS	(16)
+#define	SKEIN_MAX_STATE_WORDS	(16)
+
+#define	SKEIN_256_STATE_BYTES	(8 * SKEIN_256_STATE_WORDS)
+#define	SKEIN_512_STATE_BYTES	(8 * SKEIN_512_STATE_WORDS)
+#define	SKEIN1024_STATE_BYTES	(8 * SKEIN1024_STATE_WORDS)
+
+#define	SKEIN_256_STATE_BITS	(64 * SKEIN_256_STATE_WORDS)
+#define	SKEIN_512_STATE_BITS	(64 * SKEIN_512_STATE_WORDS)
+#define	SKEIN1024_STATE_BITS	(64 * SKEIN1024_STATE_WORDS)
+
+#define	SKEIN_256_BLOCK_BYTES	(8 * SKEIN_256_STATE_WORDS)
+#define	SKEIN_512_BLOCK_BYTES	(8 * SKEIN_512_STATE_WORDS)
+#define	SKEIN1024_BLOCK_BYTES	(8 * SKEIN1024_STATE_WORDS)
+
+typedef struct {
+	size_t hashBitLen;	/* size of hash result, in bits */
+	size_t bCnt;		/* current byte count in buffer b[] */
+	/* tweak words: T[0]=byte cnt, T[1]=flags */
+	uint64_t T[SKEIN_MODIFIER_WORDS];
+} Skein_Ctxt_Hdr_t;
+
+typedef struct {		/*  256-bit Skein hash context structure */
+	Skein_Ctxt_Hdr_t h;	/* common header context variables */
+	uint64_t X[SKEIN_256_STATE_WORDS];	/* chaining variables */
+	/* partial block buffer (8-byte aligned) */
+	uint8_t b[SKEIN_256_BLOCK_BYTES];
+} Skein_256_Ctxt_t;
+
+typedef struct {		/*  512-bit Skein hash context structure */
+	Skein_Ctxt_Hdr_t h;	/* common header context variables */
+	uint64_t X[SKEIN_512_STATE_WORDS];	/* chaining variables */
+	/* partial block buffer (8-byte aligned) */
+	uint8_t b[SKEIN_512_BLOCK_BYTES];
+} Skein_512_Ctxt_t;
+
+typedef struct {		/* 1024-bit Skein hash context structure */
+	Skein_Ctxt_Hdr_t h;	/* common header context variables */
+	uint64_t X[SKEIN1024_STATE_WORDS];	/* chaining variables */
+	/* partial block buffer (8-byte aligned) */
+	uint8_t b[SKEIN1024_BLOCK_BYTES];
+} Skein1024_Ctxt_t;
+
+/*   Skein APIs for (incremental) "straight hashing" */
+int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen);
+int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen);
+int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen);
+
+int Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg,
+    size_t msgByteCnt);
+int Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg,
+    size_t msgByteCnt);
+int Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg,
+    size_t msgByteCnt);
+
+int Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal);
+int Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal);
+int Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal);
+
+/*
+ * Skein APIs for "extended" initialization: MAC keys, tree hashing.
+ * After an InitExt() call, just use Update/Final calls as with Init().
+ *
+ * Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes.
+ *          When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL,
+ *              the results of InitExt() are identical to calling Init().
+ *          The function Init() may be called once to "precompute" the IV for
+ *              a given hashBitLen value, then by saving a copy of the context
+ *              the IV computation may be avoided in later calls.
+ *          Similarly, the function InitExt() may be called once per MAC key
+ *              to precompute the MAC IV, then a copy of the context saved and
+ *              reused for each new MAC computation.
+ */
+int Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen,
+    uint64_t treeInfo, const uint8_t *key, size_t keyBytes);
+int Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen,
+    uint64_t treeInfo, const uint8_t *key, size_t keyBytes);
+int Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen,
+    uint64_t treeInfo, const uint8_t *key, size_t keyBytes);
+
+/*
+ * Skein APIs for MAC and tree hash:
+ *	Final_Pad: pad, do final block, but no OUTPUT type
+ *	Output:    do just the output stage
+ */
+int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal);
+int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal);
+int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal);
+
+#ifndef	SKEIN_TREE_HASH
+#define	SKEIN_TREE_HASH (1)
+#endif
+#if	SKEIN_TREE_HASH
+int Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal);
+int Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal);
+int Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal);
+#endif
+
+/*
+ * When you initialize a Skein KCF hashing method you can pass this param
+ * structure in cm_param to fine-tune the algorithm's defaults.
+ */
+typedef struct skein_param {
+	size_t	sp_digest_bitlen;		/* length of digest in bits */
+} skein_param_t;
+
+/* Module definitions */
+#ifdef	SKEIN_MODULE_IMPL
+#define	CKM_SKEIN_256				"CKM_SKEIN_256"
+#define	CKM_SKEIN_512				"CKM_SKEIN_512"
+#define	CKM_SKEIN1024				"CKM_SKEIN1024"
+#define	CKM_SKEIN_256_MAC			"CKM_SKEIN_256_MAC"
+#define	CKM_SKEIN_512_MAC			"CKM_SKEIN_512_MAC"
+#define	CKM_SKEIN1024_MAC			"CKM_SKEIN1024_MAC"
+
+typedef enum skein_mech_type {
+	SKEIN_256_MECH_INFO_TYPE,
+	SKEIN_512_MECH_INFO_TYPE,
+	SKEIN1024_MECH_INFO_TYPE,
+	SKEIN_256_MAC_MECH_INFO_TYPE,
+	SKEIN_512_MAC_MECH_INFO_TYPE,
+	SKEIN1024_MAC_MECH_INFO_TYPE
+} skein_mech_type_t;
+/* Range checks; they rely on the digest/MAC ordering of skein_mech_type_t. */
+#define	VALID_SKEIN_DIGEST_MECH(__mech)				\
+	((int)(__mech) >= SKEIN_256_MECH_INFO_TYPE &&		\
+	(__mech) <= SKEIN1024_MECH_INFO_TYPE)
+#define	VALID_SKEIN_MAC_MECH(__mech)				\
+	((int)(__mech) >= SKEIN_256_MAC_MECH_INFO_TYPE &&	\
+	(__mech) <= SKEIN1024_MAC_MECH_INFO_TYPE)
+#endif	/* SKEIN_MODULE_IMPL */
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_SKEIN_H_ */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 0c71cca68..3d0b962e6 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #ifndef _SYS_SPA_H
@@ -148,6 +149,14 @@ typedef struct dva {
 
 
 /*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+	uint8_t		zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h
index 59cb44de2..7b9e1ee0c 100644
--- a/include/sys/spa_impl.h
+++ b/include/sys/spa_impl.h
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  */
 
@@ -172,6 +173,10 @@ struct spa {
 	uint64_t	spa_syncing_txg;	/* txg currently syncing */
 	bpobj_t		spa_deferred_bpobj;	/* deferred-free bplist */
 	bplist_t	spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
+	zio_cksum_salt_t spa_cksum_salt;	/* secret salt for cksum */
+	/* checksum context templates */
+	kmutex_t	spa_cksum_tmpls_lock;
+	void		*spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
 	uberblock_t	spa_ubsync;		/* last synced uberblock */
 	uberblock_t	spa_uberblock;		/* current uberblock */
 	boolean_t	spa_extreme_rewind;	/* rewind past deferred frees */
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 51b51fbec..22001559c 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -80,6 +80,10 @@ enum zio_checksum {
 	ZIO_CHECKSUM_FLETCHER_4,
 	ZIO_CHECKSUM_SHA256,
 	ZIO_CHECKSUM_ZILOG2,
+	ZIO_CHECKSUM_NOPARITY,
+	ZIO_CHECKSUM_SHA512,
+	ZIO_CHECKSUM_SKEIN,
+	ZIO_CHECKSUM_EDONR,
 	ZIO_CHECKSUM_FUNCTIONS
 };
 
diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h
index 04573ba54..b4c2c8c08 100644
--- a/include/sys/zio_checksum.h
+++ b/include/sys/zio_checksum.h
@@ -20,13 +20,15 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright Saso Kiselkov 2013, All rights reserved.
  */
 
 #ifndef _SYS_ZIO_CHECKSUM_H
 #define	_SYS_ZIO_CHECKSUM_H
 
 #include <sys/zio.h>
+#include <zfeature_common.h>
 
 #ifdef	__cplusplus
 extern "C" {
@@ -35,17 +37,36 @@ extern "C" {
 /*
  * Signature for checksum functions.
  */
-typedef void zio_checksum_func_t(const void *, uint64_t, zio_cksum_t *);
+typedef void zio_checksum_func_t(const void *, uint64_t, const void *,
+    zio_cksum_t *);
+typedef void zio_checksum_t(const void *data, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp);
+typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
+typedef void zio_checksum_tmpl_free_t(void *ctx_template);
+
+typedef enum zio_checksum_flags {
+	/* Strong enough for metadata? */
+	ZCHECKSUM_FLAG_METADATA = (1 << 1),
+	/* ZIO embedded checksum */
+	ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
+	/* Strong enough for dedup (without verification)? */
+	ZCHECKSUM_FLAG_DEDUP = (1 << 3),
+	/* Uses salt value */
+	ZCHECKSUM_FLAG_SALTED = (1 << 4),
+	/* Strong enough for nopwrite? */
+	ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
+} zio_checksum_flags_t;
 
 /*
  * Information about each checksum function.
  */
 typedef const struct zio_checksum_info {
-	zio_checksum_func_t *ci_func[2]; /* checksum function per byteorder */
-	int		ci_correctable;	/* number of correctable bits	*/
-	int		ci_eck;		/* uses zio embedded checksum? */
-	boolean_t	ci_dedup;	/* strong enough for dedup? */
-	char		*ci_name;	/* descriptive name */
+	/* checksum function for each byteorder */
+	zio_checksum_t			*ci_func[2];
+	zio_checksum_tmpl_init_t	*ci_tmpl_init;
+	zio_checksum_tmpl_free_t	*ci_tmpl_free;
+	zio_checksum_flags_t		ci_flags;
+	char				*ci_name;	/* descriptive name */
 } zio_checksum_info_t;
 
 typedef struct zio_bad_cksum {
@@ -62,7 +83,21 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
 /*
  * Checksum routines.
  */
-extern zio_checksum_func_t zio_checksum_SHA256;
+extern zio_checksum_t zio_checksum_SHA256;
+extern zio_checksum_t zio_checksum_SHA512_native;
+extern zio_checksum_t zio_checksum_SHA512_byteswap;
+
+/* Skein */
+extern zio_checksum_t zio_checksum_skein_native;
+extern zio_checksum_t zio_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+
+/* Edon-R */
+extern zio_checksum_t zio_checksum_edonr_native;
+extern zio_checksum_t zio_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
 
 extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
     void *, uint64_t, uint64_t, zio_bad_cksum_t *);
@@ -72,6 +107,8 @@ extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
     void *, uint64_t, uint64_t, zio_bad_cksum_t *);
 extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
 extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+extern void zio_checksum_templates_free(spa_t *spa);
+extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum);
 
 #ifdef	__cplusplus
 }
diff --git a/include/zfeature_common.h b/include/zfeature_common.h
index 41cfdf807..f05480181 100644
--- a/include/zfeature_common.h
+++ b/include/zfeature_common.h
@@ -51,6 +51,9 @@ typedef enum spa_feature {
 	SPA_FEATURE_FS_SS_LIMIT,
 	SPA_FEATURE_LARGE_BLOCKS,
 	SPA_FEATURE_LARGE_DNODE,
+	SPA_FEATURE_SHA512,
+	SPA_FEATURE_SKEIN,
+	SPA_FEATURE_EDONR,
 	SPA_FEATURES
 } spa_feature_t;
 
diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h
index f0cfbd573..83f92a096 100644
--- a/include/zfs_fletcher.h
+++ b/include/zfs_fletcher.h
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
 
 #ifndef	_ZFS_FLETCHER_H
 #define	_ZFS_FLETCHER_H
@@ -45,11 +48,11 @@ extern "C" {
  * checksum method is added. This method will ignore last (size % 4) bytes of
  * the data buffer.
  */
-void fletcher_2_native(const void *, uint64_t, zio_cksum_t *);
-void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *);
-void fletcher_4_native(const void *, uint64_t, zio_cksum_t *);
+void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *);
+void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
+void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *);
 void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *);
-void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *);
+void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *);
 void fletcher_4_incremental_native(const void *, uint64_t,
     zio_cksum_t *);
 void fletcher_4_incremental_byteswap(const void *, uint64_t,
diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am
index 41457fd52..0852a583a 100644
--- a/lib/libicp/Makefile.am
+++ b/lib/libicp/Makefile.am
@@ -20,7 +20,8 @@ ASM_SOURCES_AS = \
 	asm-x86_64/aes/aes_intel.S \
 	asm-x86_64/modes/gcm_intel.S \
 	asm-x86_64/sha1/sha1-x86_64.S \
-	asm-x86_64/sha2/sha256_impl.S
+	asm-x86_64/sha2/sha256_impl.S \
+	asm-x86_64/sha2/sha512_impl.S
 endif
 
 if TARGET_ASM_I386
@@ -46,6 +47,7 @@ KERNEL_C = \
 	api/kcf_mac.c \
 	algs/aes/aes_impl.c \
 	algs/aes/aes_modes.c \
+	algs/edonr/edonr.c \
 	algs/modes/modes.c \
 	algs/modes/cbc.c \
 	algs/modes/gcm.c \
@@ -54,10 +56,15 @@ KERNEL_C = \
 	algs/modes/ecb.c \
 	algs/sha1/sha1.c \
 	algs/sha2/sha2.c \
+	algs/skein/skein.c \
+	algs/skein/skein_block.c \
+	algs/skein/skein_iv.c \
 	illumos-crypto.c \
 	io/aes.c \
+	io/edonr_mod.c \
 	io/sha1_mod.c \
 	io/sha2_mod.c \
+	io/skein_mod.c \
 	os/modhash.c \
 	os/modconf.c \
 	core/kcf_sched.c \
diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h
index 6226872e5..bd89ad94f 100644
--- a/lib/libspl/include/assert.h
+++ b/lib/libspl/include/assert.h
@@ -73,6 +73,14 @@ do {									\
 #undef assert
 #endif
 
+/* Compile time assert: a false x gives array size -1, i.e. a build error */
+#define	CTASSERT_GLOBAL(x)		_CTASSERT(x, __LINE__)
+#define	CTASSERT(x)			{ _CTASSERT(x, __LINE__); }
+#define	_CTASSERT(x, y)			__CTASSERT(x, y)
+#define	__CTASSERT(x, y)						\
+	typedef char __attribute__((unused))				\
+	__compile_time_assertion__ ## y[(x) ? 1 : -1]
+
 #ifdef NDEBUG
 #define	ASSERT3S(x, y, z)	((void)0)
 #define	ASSERT3U(x, y, z)	((void)0)
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c
index 6b09cb6da..5ecf96985 100755
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -1477,6 +1477,12 @@ zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err,
 			    "property setting is not allowed on "
 			    "bootable datasets"));
 			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
+		} else if (prop == ZFS_PROP_CHECKSUM ||
+		    prop == ZFS_PROP_DEDUP) {
+			(void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "property setting is not allowed on "
+			    "root pools"));
+			(void) zfs_error(hdl, EZFS_NOTSUP, errbuf);
 		} else {
 			(void) zfs_standard_error(hdl, err, errbuf);
 		}
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c
index e409899a2..f70e34107 100644
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -61,6 +61,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/ddt.h>
 #include <sys/socket.h>
+#include <sys/sha2.h>
 
 /* in libzfs_dataset.c */
 extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
@@ -365,10 +366,11 @@ cksummer(void *arg)
 			if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum,
 			    zero_cksum) ||
 			    !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) {
+				SHA256_CTX ctx;
 				zio_cksum_t tmpsha256;
 
 				zio_checksum_SHA256(buf,
-				    payload_size, &tmpsha256);
+				    payload_size, &ctx, &tmpsha256);
 
 				drrw->drr_key.ddk_cksum.zc_word[0] =
 				    BE_64(tmpsha256.zc_word[0]);
diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am
index 351ddfeac..c2f5a50b1 100644
--- a/lib/libzpool/Makefile.am
+++ b/lib/libzpool/Makefile.am
@@ -61,6 +61,7 @@ KERNEL_C = \
 	dsl_synctask.c \
 	dsl_destroy.c \
 	dsl_userhold.c \
+	edonr_zfs.c \
 	fm.c \
 	gzip.c \
 	lzjb.c \
@@ -73,6 +74,7 @@ KERNEL_C = \
 	rrwlock.c \
 	sa.c \
 	sha256.c \
+	skein_zfs.c \
 	spa.c \
 	spa_boot.c \
 	spa_config.c \
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5
index fa04d6e81..dcfb30d18 100644
--- a/man/man5/zpool-features.5
+++ b/man/man5/zpool-features.5
@@ -1,5 +1,5 @@
 '\" te
-.\" Copyright (c) 2013 by Delphix. All rights reserved.
+.\" Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 .\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
 .\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
 .\" The contents of this file are subject to the terms of the Common Development
@@ -457,5 +457,111 @@ allow more data to be stored in the bonus buffer, thus potentially
 improving performance by avoiding the use of spill blocks.
 .RE
 
+\fB\fBsha512\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	org.illumos:sha512
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	none
+.TE
+
+This feature enables the use of the SHA-512/256 truncated hash algorithm
+(FIPS 180-4) for checksum and dedup. The native 64-bit arithmetic of
+SHA-512 provides an approximate 50% performance boost over SHA-256 on
+64-bit hardware and is thus a good minimum-change replacement candidate
+for systems where hash performance is important, but these systems
+cannot for whatever reason utilize the faster \fBskein\fR and
+\fBedonr\fR algorithms.
+
+When the \fBsha512\fR feature is set to \fBenabled\fR, the administrator
+can turn on the \fBsha512\fR checksum on any dataset using the
+\fBzfs set checksum=sha512\fR command (see \fBzfs\fR(8)).  This feature becomes
+\fBactive\fR once a \fBchecksum\fR property has been set to \fBsha512\fR,
+and will return to being \fBenabled\fR once all filesystems that have
+ever had their checksum set to \fBsha512\fR are destroyed.
+
+Booting off of pools utilizing SHA-512/256 is supported (provided that
+the updated GRUB stage2 module is installed).
+
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBskein\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	org.illumos:skein
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	none
+.TE
+
+This feature enables the use of the Skein hash algorithm for checksum
+and dedup. Skein is a high-performance secure hash algorithm that was a
+finalist in the NIST SHA-3 competition. It provides a very high security
+margin and high performance on 64-bit hardware (80% faster than
+SHA-256). This implementation also utilizes the new salted checksumming
+functionality in ZFS, which means that the checksum is pre-seeded with a
+secret 256-bit random key (stored on the pool) before being fed the data
+block to be checksummed. Thus the produced checksums are unique to a
+given pool, preventing hash collision attacks on systems with dedup.
+
+When the \fBskein\fR feature is set to \fBenabled\fR, the administrator
+can turn on the \fBskein\fR checksum on any dataset using the
+\fBzfs set checksum=skein\fR command (see \fBzfs\fR(8)).  This feature becomes
+\fBactive\fR once a \fBchecksum\fR property has been set to \fBskein\fR,
+and will return to being \fBenabled\fR once all filesystems that have
+ever had their checksum set to \fBskein\fR are destroyed.
+
+Booting off of pools using \fBskein\fR is \fBNOT\fR supported
+-- any attempt to enable \fBskein\fR on a root pool will fail with an
+error.
+
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fBedonr\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	org.illumos:edonr
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	none
+.TE
+
+This feature enables the use of the Edon-R hash algorithm for checksum,
+including for nopwrite (if compression is also enabled, an overwrite of
+a block whose checksum matches the data being written will be ignored).
+In an abundance of caution, Edon-R cannot be used with dedup
+(without verification).
+
+Edon-R is a very high-performance hash algorithm that was part
+of the NIST SHA-3 competition. It provides extremely high hash
+performance (over 350% faster than SHA-256), but was not selected
+because of its unsuitability as a general purpose secure hash algorithm.
+This implementation utilizes the new salted checksumming functionality
+in ZFS, which means that the checksum is pre-seeded with a secret
+256-bit random key (stored on the pool) before being fed the data block
+to be checksummed. Thus the produced checksums are unique to a given
+pool.
+
+When the \fBedonr\fR feature is set to \fBenabled\fR, the administrator
+can turn on the \fBedonr\fR checksum on any dataset using the
+\fBzfs set checksum=edonr\fR command (see \fBzfs\fR(8)).  This feature becomes
+\fBactive\fR once a \fBchecksum\fR property has been set to \fBedonr\fR,
+and will return to being \fBenabled\fR once all filesystems that have
+ever had their checksum set to \fBedonr\fR are destroyed.
+
+Booting off of pools using \fBedonr\fR is \fBNOT\fR supported
+-- any attempt to enable \fBedonr\fR on a root pool will fail with an
+error.
+
 .SH "SEE ALSO"
 \fBzpool\fR(8)
diff --git a/man/man8/zfs.8 b/man/man8/zfs.8
index e13fc1a52..e543ba51d 100644
--- a/man/man8/zfs.8
+++ b/man/man8/zfs.8
@@ -837,12 +837,23 @@ The values \fBon\fR and \fBnoauto\fR are equivalent to the \fBauto\fR and \fBnoa
 .sp
 .ne 2
 .na
-\fB\fBchecksum\fR=\fBon\fR | \fBoff\fR | \fBfletcher2\fR | \fBfletcher4\fR | \fBsha256\fR\fR
+\fB\fBchecksum\fR=\fBon\fR | \fBoff\fR | \fBfletcher2\fR | \fBfletcher4\fR | \fBsha256\fR | \fBnoparity\fR | \fBsha512\fR | \fBskein\fR | \fBedonr\fR\fR
 .ad
 .sp .6
 .RS 4n
-Controls the checksum used to verify data integrity. The default value is \fBon\fR, which automatically selects an appropriate algorithm (currently, \fBfletcher4\fR, but this may change in future releases). The value \fBoff\fR disables integrity checking on user data. Disabling checksums is \fBNOT\fR a recommended practice.
+Controls the checksum used to verify data integrity. The default value is
+\fBon\fR, which automatically selects an appropriate algorithm (currently,
+\fBfletcher4\fR, but this may change in future releases). The value \fBoff\fR
+disables integrity checking on user data.  The value \fBnoparity\fR not only
+disables integrity checking but also disables maintaining parity for user data.
+This setting is used internally by a dump device residing on a RAID-Z pool and
+should not be used by any other dataset.  Disabling checksums is \fBNOT\fR a
+recommended practice.
 .sp
+The \fBsha512\fR, \fBskein\fR, and \fBedonr\fR checksum algorithms require
+enabling the appropriate features on the pool. Please see
+\fBzpool-features\fR(5) for more information on these algorithms.
+
 Changing this property affects only newly-written data.
 .RE
 
diff --git a/module/icp/Makefile.in b/module/icp/Makefile.in
index 4be03dbae..b822635b7 100644
--- a/module/icp/Makefile.in
+++ b/module/icp/Makefile.in
@@ -12,6 +12,7 @@ ASM_SOURCES += asm-x86_64/aes/aes_intel.o
 ASM_SOURCES += asm-x86_64/modes/gcm_intel.o
 ASM_SOURCES += asm-x86_64/sha1/sha1-x86_64.o
 ASM_SOURCES += asm-x86_64/sha2/sha256_impl.o
+ASM_SOURCES += asm-x86_64/sha2/sha512_impl.o
 endif
 
 ifeq ($(TARGET_ASM_DIR), asm-i386)
@@ -43,8 +44,10 @@ $(MODULE)-objs += core/kcf_mech_tabs.o
 $(MODULE)-objs += core/kcf_prov_lib.o
 $(MODULE)-objs += spi/kcf_spi.o
 $(MODULE)-objs += io/aes.o
+$(MODULE)-objs += io/edonr_mod.o
 $(MODULE)-objs += io/sha1_mod.o
 $(MODULE)-objs += io/sha2_mod.o
+$(MODULE)-objs += io/skein_mod.o
 $(MODULE)-objs += os/modhash.o
 $(MODULE)-objs += os/modconf.o
 $(MODULE)-objs += algs/modes/cbc.o
@@ -55,8 +58,13 @@ $(MODULE)-objs += algs/modes/gcm.o
 $(MODULE)-objs += algs/modes/modes.o
 $(MODULE)-objs += algs/aes/aes_impl.o
 $(MODULE)-objs += algs/aes/aes_modes.o
+$(MODULE)-objs += algs/edonr/edonr.o
 $(MODULE)-objs += algs/sha1/sha1.o
 $(MODULE)-objs += algs/sha2/sha2.o
+$(MODULE)-objs += algs/sha1/sha1.o
+$(MODULE)-objs += algs/skein/skein.o
+$(MODULE)-objs += algs/skein/skein_block.o
+$(MODULE)-objs += algs/skein/skein_iv.o
 $(MODULE)-objs += $(ASM_SOURCES)
 
 ICP_DIRS = \
@@ -67,9 +75,11 @@ ICP_DIRS = \
 	os \
 	algs \
 	algs/aes \
+	algs/edonr \
 	algs/modes \
 	algs/sha1 \
 	algs/sha2 \
+	algs/skein \
 	asm-x86_64 \
 	asm-x86_64/aes \
 	asm-x86_64/modes \
diff --git a/module/icp/algs/edonr/edonr.c b/module/icp/algs/edonr/edonr.c
new file mode 100644
index 000000000..8ae989890
--- /dev/null
+++ b/module/icp/algs/edonr/edonr.c
@@ -0,0 +1,751 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <[email protected]>
+ * Tweaked Edon-R implementation for SUPERCOP, based on NIST API.
+ *
+ * $Id: edonr.c 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+/* determine where we can get bcopy/bzero declarations */
+#ifdef	_KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+#include <sys/edonr.h>
+#include <sys/debug.h>
+
+/* big endian support, provides no-op's if run on little endian hosts */
+#include "edonr_byteorder.h"
+
+#define	hashState224(x)	((x)->pipe->p256)
+#define	hashState256(x)	((x)->pipe->p256)
+#define	hashState384(x)	((x)->pipe->p512)
+#define	hashState512(x)	((x)->pipe->p512)
+
+/* shift and rotate shortcuts; rotate counts must be in 1..(width - 1) */
+#define	shl(x, n)	((x) << (n))
+#define	shr(x, n)	((x) >> (n))
+
+#define	rotl32(x, n)	(((x) << (n)) | ((x) >> (32 - (n))))
+#define	rotr32(x, n)	(((x) >> (n)) | ((x) << (32 - (n))))
+
+#define	rotl64(x, n)	(((x) << (n)) | ((x) >> (64 - (n))))
+#define	rotr64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))
+
+#if !defined(__C99_RESTRICT)
+#define	restrict	/* restrict */
+#endif
+
+#define	EDONR_VALID_HASHBITLEN(x) \
+	((x) == 512 || (x) == 384 || (x) == 256 || (x) == 224)
+
+/* EdonR224 initial double chaining pipe: bytes 0x00..0x3f as 32-bit words */
+static const uint32_t i224p2[16] = {
+	0x00010203ul, 0x04050607ul, 0x08090a0bul, 0x0c0d0e0ful,
+	0x10111213ul, 0x14151617ul, 0x18191a1bul, 0x1c1d1e1ful,
+	0x20212223ul, 0x24252627ul, 0x28292a2bul, 0x2c2d2e2ful,
+	0x30313233ul, 0x34353637ul, 0x38393a3bul, 0x3c3d3e3ful,
+};
+
+/* EdonR256 initial double chaining pipe: bytes 0x40..0x7f as 32-bit words */
+static const uint32_t i256p2[16] = {
+	0x40414243ul, 0x44454647ul, 0x48494a4bul, 0x4c4d4e4ful,
+	0x50515253ul, 0x54555657ul, 0x58595a5bul, 0x5c5d5e5ful,
+	0x60616263ul, 0x64656667ul, 0x68696a6bul, 0x6c6d6e6ful,
+	0x70717273ul, 0x74757677ul, 0x78797a7bul, 0x7c7d7e7ful,
+};
+
+/* EdonR384 initial double chaining pipe: bytes 0x00..0x7f as 64-bit words */
+static const uint64_t i384p2[16] = {
+	0x0001020304050607ull, 0x08090a0b0c0d0e0full,
+	0x1011121314151617ull, 0x18191a1b1c1d1e1full,
+	0x2021222324252627ull, 0x28292a2b2c2d2e2full,
+	0x3031323334353637ull, 0x38393a3b3c3d3e3full,
+	0x4041424344454647ull, 0x48494a4b4c4d4e4full,
+	0x5051525354555657ull, 0x58595a5b5c5d5e5full,
+	0x6061626364656667ull, 0x68696a6b6c6d6e6full,
+	0x7071727374757677ull, 0x78797a7b7c7d7e7full
+};
+
+/* EdonR512 initial double chaining pipe: bytes 0x80..0xff as 64-bit words */
+static const uint64_t i512p2[16] = {
+	0x8081828384858687ull, 0x88898a8b8c8d8e8full,
+	0x9091929394959697ull, 0x98999a9b9c9d9e9full,
+	0xa0a1a2a3a4a5a6a7ull, 0xa8a9aaabacadaeafull,
+	0xb0b1b2b3b4b5b6b7ull, 0xb8b9babbbcbdbebfull,
+	0xc0c1c2c3c4c5c6c7ull, 0xc8c9cacbcccdcecfull,
+	0xd0d1d2d3d4d5d6d7ull, 0xd8d9dadbdcdddedfull,
+	0xe0e1e2e3e4e5e6e7ull, 0xe8e9eaebecedeeefull,
+	0xf0f1f2f3f4f5f6f7ull, 0xf8f9fafbfcfdfeffull
+};
+
+/*
+ * First Latin Square.  LS1_* assign the enclosing scope's s0..s7.
+ * 0   7   1   3   2   4   6   5
+ * 4   1   7   6   3   0   5   2
+ * 7   0   4   2   5   3   1   6
+ * 1   4   0   5   6   2   7   3
+ * 2   3   6   7   1   5   0   4
+ * 5   2   3   1   7   6   4   0
+ * 3   6   5   0   4   7   2   1
+ * 6   5   2   4   0   1   3   7
+ */
+#define	LS1_256(c, x0, x1, x2, x3, x4, x5, x6, x7)			\
+{									\
+	uint32_t x04, x17, x23, x56, x07, x26;				\
+	x04 = x0+x4, x17 = x1+x7, x07 = x04+x17;			\
+	s0 = c + x07 + x2;						\
+	s1 = rotl32(x07 + x3, 4);					\
+	s2 = rotl32(x07 + x6, 8);					\
+	x23 = x2 + x3;							\
+	s5 = rotl32(x04 + x23 + x5, 22);				\
+	x56 = x5 + x6;							\
+	s6 = rotl32(x17 + x56 + x0, 24);				\
+	x26 = x23+x56;							\
+	s3 = rotl32(x26 + x7, 13);					\
+	s4 = rotl32(x26 + x1, 17);					\
+	s7 = rotl32(x26 + x4, 29);					\
+}
+
+#define	LS1_512(c, x0, x1, x2, x3, x4, x5, x6, x7)			\
+{									\
+	uint64_t x04, x17, x23, x56, x07, x26;				\
+	x04 = x0+x4, x17 = x1+x7, x07 = x04+x17;			\
+	s0 = c + x07 + x2;						\
+	s1 = rotl64(x07 + x3, 5);					\
+	s2 = rotl64(x07 + x6, 15);					\
+	x23 = x2 + x3;							\
+	s5 = rotl64(x04 + x23 + x5, 40);				\
+	x56 = x5 + x6;							\
+	s6 = rotl64(x17 + x56 + x0, 50);				\
+	x26 = x23+x56;							\
+	s3 = rotl64(x26 + x7, 22);					\
+	s4 = rotl64(x26 + x1, 31);					\
+	s7 = rotl64(x26 + x4, 59);					\
+}
+
+/*
+ * Second Orthogonal Latin Square.  LS2_* assign the enclosing scope's t0..t7.
+ * 0   4   2   3   1   6   5   7
+ * 7   6   3   2   5   4   1   0
+ * 5   3   1   6   0   2   7   4
+ * 1   0   5   4   3   7   2   6
+ * 2   1   0   7   4   5   6   3
+ * 3   5   7   0   6   1   4   2
+ * 4   7   6   1   2   0   3   5
+ * 6   2   4   5   7   3   0   1
+ */
+#define	LS2_256(c, y0, y1, y2, y3, y4, y5, y6, y7)			\
+{									\
+	uint32_t y01, y25, y34, y67, y04, y05, y27, y37;		\
+	y01 = y0+y1, y25 = y2+y5, y05 = y01+y25;			\
+	t0  = ~c + y05 + y7;						\
+	t2 = rotl32(y05 + y3, 9);					\
+	y34 = y3+y4, y04 = y01+y34;					\
+	t1 = rotl32(y04 + y6, 5);					\
+	t4 = rotl32(y04 + y5, 15);					\
+	y67 = y6+y7, y37 = y34+y67;					\
+	t3 = rotl32(y37 + y2, 11);					\
+	t7 = rotl32(y37 + y0, 27);					\
+	y27 = y25+y67;							\
+	t5 = rotl32(y27 + y4, 20);					\
+	t6 = rotl32(y27 + y1, 25);					\
+}
+
+#define	LS2_512(c, y0, y1, y2, y3, y4, y5, y6, y7)			\
+{									\
+	uint64_t y01, y25, y34, y67, y04, y05, y27, y37;		\
+	y01 = y0+y1, y25 = y2+y5, y05 = y01+y25;			\
+	t0  = ~c + y05 + y7;						\
+	t2 = rotl64(y05 + y3, 19);					\
+	y34 = y3+y4, y04 = y01+y34;					\
+	t1 = rotl64(y04 + y6, 10);					\
+	t4 = rotl64(y04 + y5, 36);					\
+	y67 = y6+y7, y37 = y34+y67;					\
+	t3 = rotl64(y37 + y2, 29);					\
+	t7 = rotl64(y37 + y0, 55);					\
+	y27 = y25+y67;							\
+	t5 = rotl64(y27 + y4, 44);					\
+	t6 = rotl64(y27 + y1, 48);					\
+}
+
+#define	quasi_exform256(r0, r1, r2, r3, r4, r5, r6, r7)			\
+{									\
+	uint32_t s04, s17, s23, s56, t01, t25, t34, t67;		\
+	s04 = s0 ^ s4, t01 = t0 ^ t1;					\
+	r0 = (s04 ^ s1) + (t01 ^ t5);					\
+	t67 = t6 ^ t7;							\
+	r1 = (s04 ^ s7) + (t2 ^ t67);					\
+	s23 = s2 ^ s3;							\
+	r7 = (s23 ^ s5) + (t4 ^ t67);					\
+	t34 = t3 ^ t4;							\
+	r3 = (s23 ^ s4) + (t0 ^ t34);					\
+	s56 = s5 ^ s6;							\
+	r5 = (s3 ^ s56) + (t34 ^ t6);					\
+	t25 = t2 ^ t5;							\
+	r6 = (s2 ^ s56) + (t25 ^ t7);					\
+	s17 = s1 ^ s7;							\
+	r4 = (s0 ^ s17) + (t1 ^ t25);					\
+	r2 = (s17 ^ s6) + (t01 ^ t3);					\
+}	/* r0..r7 are computed from the enclosing scope's s0..s7 and t0..t7 */
+
+#define	quasi_exform512(r0, r1, r2, r3, r4, r5, r6, r7)			\
+{									\
+	uint64_t s04, s17, s23, s56, t01, t25, t34, t67;		\
+	s04 = s0 ^ s4, t01 = t0 ^ t1;					\
+	r0 = (s04 ^ s1) + (t01 ^ t5);					\
+	t67 = t6 ^ t7;							\
+	r1 = (s04 ^ s7) + (t2 ^ t67);					\
+	s23 = s2 ^ s3;							\
+	r7 = (s23 ^ s5) + (t4 ^ t67);					\
+	t34 = t3 ^ t4;							\
+	r3 = (s23 ^ s4) + (t0 ^ t34);					\
+	s56 = s5 ^ s6;							\
+	r5 = (s3 ^ s56) + (t34 ^ t6);					\
+	t25 = t2 ^ t5;							\
+	r6 = (s2 ^ s56) + (t25 ^ t7);					\
+	s17 = s1 ^ s7;							\
+	r4 = (s0 ^ s17) + (t1 ^ t25);					\
+	r2 = (s17 ^ s6) + (t01 ^ t3);					\
+}	/* r0..r7 are computed from the enclosing scope's s0..s7 and t0..t7 */
+
+static size_t
+Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p)
+{
+	size_t bl;	/* bits of input not yet consumed */
+
+	for (bl = bitlen; bl >= EdonR256_BLOCK_BITSIZE;
+	    bl -= EdonR256_BLOCK_BITSIZE, data += 16) {
+		uint32_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+		    t5, t6, t7;
+		uint32_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+		    q5, q6, q7;
+		const uint32_t defix = 0xaaaaaaaa;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint32_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+		    swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define	d(j)	swp ## j
+#define	s32(j)	ld_swap32((uint32_t *)data + j, swp ## j)
+#else
+#define	d(j)	data[j]
+#endif
+
+		/* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s32(8);
+		s32(9);
+		s32(10);
+		s32(11);
+		s32(12);
+		s32(13);
+		s32(14);
+		s32(15);
+#endif
+		LS1_256(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+		    d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s32(0);
+		s32(1);
+		s32(2);
+		s32(3);
+		s32(4);
+		s32(5);
+		s32(6);
+		s32(7);
+#undef s32
+#endif
+		LS2_256(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+		    d(15));
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Second row of quasigroup e-transformations */
+		LS1_256(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+		    p[15]);
+		LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Third row of quasigroup e-transformations */
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Fourth row of quasigroup e-transformations */
+		LS1_256(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+		LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Edon-R tweak on the original SHA-3 Edon-R submission. */
+		p[0] ^= d(8) ^ p0;
+		p[1] ^= d(9) ^ p1;
+		p[2] ^= d(10) ^ p2;
+		p[3] ^= d(11) ^ p3;
+		p[4] ^= d(12) ^ p4;
+		p[5] ^= d(13) ^ p5;
+		p[6] ^= d(14) ^ p6;
+		p[7] ^= d(15) ^ p7;
+		p[8] ^= d(0) ^ q0;
+		p[9] ^= d(1) ^ q1;
+		p[10] ^= d(2) ^ q2;
+		p[11] ^= d(3) ^ q3;
+		p[12] ^= d(4) ^ q4;
+		p[13] ^= d(5) ^ q5;
+		p[14] ^= d(6) ^ q6;
+		p[15] ^= d(7) ^ q7;
+	}
+
+#undef d
+	return (bitlen - bl);	/* bits consumed: whole blocks only */
+}
+
+/*
+ * Why is this #pragma here?
+ *
+ * Checksum functions like this one can go over the stack frame size check
+ * Linux imposes on 32-bit platforms (-Wframe-larger-than=1024).  We can
+ * safely ignore the compiler error since we know that, in ZoL,
+ * the function will be called from a worker thread that won't be using
+ * much stack.  The only function that goes over the 1k limit is Q512(),
+ * which only goes over it by a hair (1248 bytes on ARM32).
+ */
+#include <sys/isa_defs.h>	/* for _ILP32 */
+#ifdef _ILP32   /* We're 32-bit, assume small stack frames */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+#if defined(__IBMC__) && defined(_AIX) && defined(__64BIT__)
+static inline size_t
+#else
+static size_t
+#endif
+Q512(size_t bitlen, const uint64_t *data, uint64_t *restrict p)
+{
+	size_t bl;	/* bits of input not yet consumed */
+
+	for (bl = bitlen; bl >= EdonR512_BLOCK_BITSIZE;
+	    bl -= EdonR512_BLOCK_BITSIZE, data += 16) {
+		uint64_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+		    t5, t6, t7;
+		uint64_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+		    q5, q6, q7;
+		const uint64_t defix = 0xaaaaaaaaaaaaaaaaull;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint64_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+		    swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define	d(j)	swp##j
+#define	s64(j)	ld_swap64((uint64_t *)data+j, swp##j)
+#else
+#define	d(j)	data[j]
+#endif
+
+		/* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s64(8);
+		s64(9);
+		s64(10);
+		s64(11);
+		s64(12);
+		s64(13);
+		s64(14);
+		s64(15);
+#endif
+		LS1_512(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+		    d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		s64(0);
+		s64(1);
+		s64(2);
+		s64(3);
+		s64(4);
+		s64(5);
+		s64(6);
+		s64(7);
+#undef s64
+#endif
+		LS2_512(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+		    d(15));
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Second row of quasigroup e-transformations */
+		LS1_512(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+		    p[15]);
+		LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Third row of quasigroup e-transformations */
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Fourth row of quasigroup e-transformations */
+		LS1_512(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+		LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+		LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+		LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+		quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+		/* Edon-R tweak on the original SHA-3 Edon-R submission. */
+		p[0] ^= d(8) ^ p0;
+		p[1] ^= d(9) ^ p1;
+		p[2] ^= d(10) ^ p2;
+		p[3] ^= d(11) ^ p3;
+		p[4] ^= d(12) ^ p4;
+		p[5] ^= d(13) ^ p5;
+		p[6] ^= d(14) ^ p6;
+		p[7] ^= d(15) ^ p7;
+		p[8] ^= d(0) ^ q0;
+		p[9] ^= d(1) ^ q1;
+		p[10] ^= d(2) ^ q2;
+		p[11] ^= d(3) ^ q3;
+		p[12] ^= d(4) ^ q4;
+		p[13] ^= d(5) ^ q5;
+		p[14] ^= d(6) ^ q6;
+		p[15] ^= d(7) ^ q7;
+	}
+
+#undef d
+	return (bitlen - bl);	/* bits consumed: whole blocks only */
+}
+
+/*
+ * Start a new hash: reset the bit counters and load the initial double
+ * chaining pipe for the requested digest size (224, 256, 384 or 512 bits).
+ * The 224/256 variants use sixteen 32-bit words (pipe->p256), the 384/512
+ * variants sixteen 64-bit words (pipe->p512).
+ */
+void
+EdonRInit(EdonRState *state, size_t hashbitlen)
+{
+	ASSERT(EDONR_VALID_HASHBITLEN(hashbitlen));
+	state->bits_processed = 0;
+	state->unprocessed_bits = 0;
+	switch (hashbitlen) {
+	case 224:
+		state->hashbitlen = 224;
+		bcopy(i224p2, hashState224(state)->DoublePipe,
+		    16 * sizeof (uint32_t));
+		break;
+
+	case 256:
+		state->hashbitlen = 256;
+		bcopy(i256p2, hashState256(state)->DoublePipe,
+		    16 * sizeof (uint32_t));
+		break;
+
+	case 384:
+		state->hashbitlen = 384;
+		bcopy(i384p2, hashState384(state)->DoublePipe,
+		    16 * sizeof (uint64_t));
+		break;
+
+	case 512:
+		state->hashbitlen = 512;
+		bcopy(i512p2, hashState512(state)->DoublePipe,
+		    16 * sizeof (uint64_t));
+		break;
+	}
+}
+
+/*
+ * Absorb databitlen bits of data.  Whole blocks go through Q256()/Q512();
+ * a trailing partial block is saved in LastPart for the next call.  While a
+ * partial block is pending, a call must not supply more than fills one block.
+ */
+void
+EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen)
+{
+	uint32_t *data32;
+	uint64_t *data64;
+
+	size_t bits_processed;
+
+	ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+	switch (state->hashbitlen) {
+	case 224:
+	case 256:
+		if (state->unprocessed_bits > 0) {
+			/* LastBytes = databitlen / 8 */
+			int LastBytes = (int)databitlen >> 3;
+
+			ASSERT(state->unprocessed_bits + databitlen <=
+			    EdonR256_BLOCK_SIZE * 8);
+
+			bcopy(data, hashState256(state)->LastPart
+			    + (state->unprocessed_bits >> 3), LastBytes);
+			state->unprocessed_bits += (int)databitlen;
+			databitlen = state->unprocessed_bits;
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data32 = (uint32_t *)hashState256(state)->LastPart;
+		} else
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data32 = (uint32_t *)data;
+
+		bits_processed = Q256(databitlen, data32,
+		    hashState256(state)->DoublePipe);
+		state->bits_processed += bits_processed;
+		databitlen -= bits_processed;
+		state->unprocessed_bits = (int)databitlen;
+		if (databitlen > 0) {
+			/* LastBytes = Ceil(databitlen / 8) */
+			int LastBytes = (int)((databitlen + 7) >> 3);
+
+			data32 += bits_processed >> 5;	/* skip hashed words */
+			bcopy(data32, hashState256(state)->LastPart, LastBytes);
+		}
+		break;
+
+	case 384:
+	case 512:
+		if (state->unprocessed_bits > 0) {
+			/* LastBytes = databitlen / 8 */
+			int LastBytes = (int)databitlen >> 3;
+
+			ASSERT(state->unprocessed_bits + databitlen <=
+			    EdonR512_BLOCK_SIZE * 8);
+
+			bcopy(data, hashState512(state)->LastPart
+			    + (state->unprocessed_bits >> 3), LastBytes);
+			state->unprocessed_bits += (int)databitlen;
+			databitlen = state->unprocessed_bits;
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data64 = (uint64_t *)hashState512(state)->LastPart;
+		} else
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			data64 = (uint64_t *)data;
+
+		bits_processed = Q512(databitlen, data64,
+		    hashState512(state)->DoublePipe);
+		state->bits_processed += bits_processed;
+		databitlen -= bits_processed;
+		state->unprocessed_bits = (int)databitlen;
+		if (databitlen > 0) {
+			/* LastBytes = Ceil(databitlen / 8) */
+			int LastBytes = (int)((databitlen + 7) >> 3);
+
+			data64 += bits_processed >> 6;	/* skip hashed words */
+			bcopy(data64, hashState512(state)->LastPart, LastBytes);
+		}
+		break;
+	}
+}
+
+void
+EdonRFinal(EdonRState *state, uint8_t *hashval)
+{
+	uint32_t *data32;
+	uint64_t *data64, num_bits;	/* num_bits: total message length */
+
+	size_t databitlen;		/* size of padded tail, in bits */
+	int LastByte, PadOnePosition;	/* byte / bit index of the pad 1 */
+
+	num_bits = state->bits_processed + state->unprocessed_bits;
+	ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen));
+	switch (state->hashbitlen) {
+	case 224:
+	case 256:
+		LastByte = (int)state->unprocessed_bits >> 3;
+		PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+		hashState256(state)->LastPart[LastByte] =
+		    (hashState256(state)->LastPart[LastByte]
+		    & (0xff << (PadOnePosition + 1))) ^
+		    (0x01 << PadOnePosition);
+		/* LINTED E_BAD_PTR_CAST_ALIGN */
+		data64 = (uint64_t *)hashState256(state)->LastPart;
+
+		if (state->unprocessed_bits < 448) {
+			(void) memset((hashState256(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR256_BLOCK_SIZE - LastByte - 9);
+			databitlen = EdonR256_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 7);
+#else
+			data64[7] = num_bits;
+#endif
+		} else {
+			(void) memset((hashState256(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR256_BLOCK_SIZE * 2 - LastByte - 9);
+			databitlen = EdonR256_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 15);
+#else
+			data64[15] = num_bits;
+#endif
+		}
+
+		/* LINTED E_BAD_PTR_CAST_ALIGN */
+		data32 = (uint32_t *)hashState256(state)->LastPart;
+		state->bits_processed += Q256(databitlen, data32,
+		    hashState256(state)->DoublePipe);
+		break;
+
+	case 384:
+	case 512:
+		LastByte = (int)state->unprocessed_bits >> 3;
+		PadOnePosition = 7 - (state->unprocessed_bits & 0x07);
+		hashState512(state)->LastPart[LastByte] =
+		    (hashState512(state)->LastPart[LastByte]
+		    & (0xff << (PadOnePosition + 1))) ^
+		    (0x01 << PadOnePosition);
+		/* LINTED E_BAD_PTR_CAST_ALIGN */
+		data64 = (uint64_t *)hashState512(state)->LastPart;
+
+		if (state->unprocessed_bits < 960) {
+			(void) memset((hashState512(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR512_BLOCK_SIZE - LastByte - 9);
+			databitlen = EdonR512_BLOCK_SIZE * 8;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 15);
+#else
+			data64[15] = num_bits;
+#endif
+		} else {
+			(void) memset((hashState512(state)->LastPart) +
+			    LastByte + 1, 0x00,
+			    EdonR512_BLOCK_SIZE * 2 - LastByte - 9);
+			databitlen = EdonR512_BLOCK_SIZE * 16;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+			st_swap64(num_bits, data64 + 31);
+#else
+			data64[31] = num_bits;
+#endif
+		}
+
+		state->bits_processed += Q512(databitlen, data64,
+		    hashState512(state)->DoublePipe);
+		break;
+	}
+
+	switch (state->hashbitlen) {	/* copy the digest out of the pipe */
+	case 224: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint32_t *d32 = (uint32_t *)hashval;
+		uint32_t *s32 = hashState224(state)->DoublePipe + 9;
+		int j;
+
+		for (j = 0; j < EdonR224_DIGEST_SIZE >> 2; j++)
+			st_swap32(s32[j], d32 + j);
+#else
+		bcopy(hashState256(state)->DoublePipe + 9, hashval,
+		    EdonR224_DIGEST_SIZE);
+#endif
+		break;
+	}
+	case 256: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint32_t *d32 = (uint32_t *)hashval;
+		uint32_t *s32 = hashState224(state)->DoublePipe + 8;
+		int j;
+
+		for (j = 0; j < EdonR256_DIGEST_SIZE >> 2; j++)
+			st_swap32(s32[j], d32 + j);
+#else
+		bcopy(hashState256(state)->DoublePipe + 8, hashval,
+		    EdonR256_DIGEST_SIZE);
+#endif
+		break;
+	}
+	case 384: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint64_t *d64 = (uint64_t *)hashval;
+		uint64_t *s64 = hashState384(state)->DoublePipe + 10;
+		int j;
+
+		for (j = 0; j < EdonR384_DIGEST_SIZE >> 3; j++)
+			st_swap64(s64[j], d64 + j);
+#else
+		bcopy(hashState384(state)->DoublePipe + 10, hashval,
+		    EdonR384_DIGEST_SIZE);
+#endif
+		break;
+	}
+	case 512: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+		uint64_t *d64 = (uint64_t *)hashval;
+		uint64_t *s64 = hashState512(state)->DoublePipe + 8;
+		int j;
+
+		for (j = 0; j < EdonR512_DIGEST_SIZE >> 3; j++)
+			st_swap64(s64[j], d64 + j);
+#else
+		bcopy(hashState512(state)->DoublePipe + 8, hashval,
+		    EdonR512_DIGEST_SIZE);
+#endif
+		break;
+	}
+	}
+}
+
+/* One-shot helper: EdonRInit(), one EdonRUpdate() over data, EdonRFinal(). */
+void
+EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen,
+    uint8_t *hashval)
+{
+	EdonRState state;
+
+	EdonRInit(&state, hashbitlen);
+	EdonRUpdate(&state, data, databitlen);
+	EdonRFinal(&state, hashval);
+}
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(EdonRInit);
+EXPORT_SYMBOL(EdonRUpdate);
+EXPORT_SYMBOL(EdonRHash);
+EXPORT_SYMBOL(EdonRFinal);
+#endif
diff --git a/module/icp/algs/edonr/edonr_byteorder.h b/module/icp/algs/edonr/edonr_byteorder.h
new file mode 100644
index 000000000..d17e8f1fd
--- /dev/null
+++ b/module/icp/algs/edonr/edonr_byteorder.h
@@ -0,0 +1,216 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen <[email protected]>
+ *
+ * C header file to determine compile machine byte order. Take care when cross
+ * compiling.
+ *
+ * $Id: byteorder.h 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#ifndef _CRYPTO_EDONR_BYTEORDER_H
+#define	_CRYPTO_EDONR_BYTEORDER_H
+
+
+#include <sys/param.h>
+
+#if defined(__BYTE_ORDER)
+#if (__BYTE_ORDER == __BIG_ENDIAN)
+#define	MACHINE_IS_BIG_ENDIAN
+#elif (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define	MACHINE_IS_LITTLE_ENDIAN
+#endif
+#elif defined(BYTE_ORDER)
+#if (BYTE_ORDER == BIG_ENDIAN)
+#define	MACHINE_IS_BIG_ENDIAN
+#elif (BYTE_ORDER == LITTLE_ENDIAN)
+#define	MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* __BYTE_ORDER || BYTE_ORDER */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#if defined(_BIG_ENDIAN) || defined(_MIPSEB)
+#define	MACHINE_IS_BIG_ENDIAN
+#endif
+#if defined(_LITTLE_ENDIAN) || defined(_MIPSEL)
+#define	MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* !MACHINE_IS_BIG_ENDIAN && !MACHINE_IS_LITTLE_ENDIAN */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#error unknown machine byte sex
+#endif
+
+#define	BYTEORDER_INCLUDED
+
+#if defined(MACHINE_IS_BIG_ENDIAN)
+/*
+ * Byte swapping macros for big endian architectures and compilers,
+ * add as appropriate for other architectures and/or compilers.
+ *
+ *     ld_swap64(src,dst) : uint64_t dst = *(src)
+ *     st_swap64(src,dst) : *(dst)       = uint64_t src
+ */
+
+#if defined(__PPC__) || defined(_ARCH_PPC)
+
+#if defined(__64BIT__)
+#if defined(_ARCH_PWR7)
+#define	aix_ld_swap64(s64, d64)\
+	__asm__("ldbrx %0,0,%1" : "=r"(d64) : "r"(s64))
+#define	aix_st_swap64(s64, d64)\
+	__asm__ volatile("stdbrx %1,0,%0" : : "r"(d64), "r"(s64))
+#else
+#define	aix_ld_swap64(s64, d64)						\
+{									\
+	uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */	\
+									\
+	__asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0;rldimi %1,%2,32,0"\
+		: "+r"(s4), "=r"(d64), "=r"(h) : "b"(s64));		\
+}
+
+#define	aix_st_swap64(s64, d64)						\
+{									\
+	uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */	\
+	h = (s64) >> 32;						\
+	__asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0"	\
+		: "+r"(s4) : "r"(s64), "r"(h), "b"(d64));		\
+}
+#endif /* 64BIT && PWR7 */
+#else
+#define	aix_ld_swap64(s64, d64)						\
+{									\
+	uint32_t *s4 = 0, h, l;	/* initialize to zero for gcc warning */\
+	__asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0"		\
+		: "+r"(s4), "=r"(l), "=r"(h) : "b"(s64));		\
+	d64 = ((uint64_t)h<<32) | l;					\
+}
+
+#define	aix_st_swap64(s64, d64)						\
+{									\
+	uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\
+	l = (s64) & 0xfffffffful, h = (s64) >> 32;			\
+	__asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0"	\
+		: "+r"(s4) : "r"(l), "r"(h), "b"(d64));			\
+}
+#endif /* __64BIT__ */
+#define	aix_ld_swap32(s32, d32)\
+	__asm__("lwbrx %0,0,%1" : "=r"(d32) : "r"(s32))
+#define	aix_st_swap32(s32, d32)\
+	__asm__ volatile("stwbrx %1,0,%0" : : "r"(d32), "r"(s32))
+#define	ld_swap32(s, d) aix_ld_swap32(s, d)
+#define	st_swap32(s, d) aix_st_swap32(s, d)
+#define	ld_swap64(s, d) aix_ld_swap64(s, d)
+#define	st_swap64(s, d) aix_st_swap64(s, d)
+#endif /* __PPC__ || _ARCH_PPC */
+
+#if defined(__sparc)
+#if !defined(__arch64__) && !defined(__sparcv8) && defined(__sparcv9)
+#define	__arch64__
+#endif
+#if defined(__GNUC__) || (defined(__SUNPRO_C) && __SUNPRO_C > 0x590)
+/* need Sun Studio C 5.10 and above for GNU inline assembly */
+#if defined(__arch64__)
+#define	sparc_ld_swap64(s64, d64)					\
+	__asm__("ldxa [%1]0x88,%0" : "=r"(d64) : "r"(s64))
+#define	sparc_st_swap64(s64, d64)					\
+	__asm__ volatile("stxa %0,[%1]0x88" : : "r"(s64), "r"(d64))
+#define	st_swap64(s, d) sparc_st_swap64(s, d)
+#else
+#define	sparc_ld_swap64(s64, d64)					\
+{									\
+	uint32_t *s4 = 0, h, l;	/* initialize to zero for gcc warning */\
+	__asm__("add %3,4,%0\n\tlda [%3]0x88,%1\n\tlda [%0]0x88,%2"	\
+		: "+r"(s4), "=r"(l), "=r"(h) : "r"(s64));		\
+	d64 = ((uint64_t)h<<32) | l;					\
+}
+#define	sparc_st_swap64(s64, d64)					\
+{									\
+	uint32_t *s4 = 0, h, l;	/* initialize to zero for gcc warning */\
+	l = (s64) & 0xfffffffful, h = (s64) >> 32;			\
+	__asm__ volatile("add %3,4,%0\n\tsta %1,[%3]0x88\n\tsta %2,[%0]0x88"\
+		: "+r"(s4) : "r"(l), "r"(h), "r"(d64));			\
+}
+#endif /* sparc64 */
+#define	sparc_ld_swap32(s32, d32)\
+	__asm__("lda [%1]0x88,%0" : "=r"(d32) : "r"(s32))
+#define	sparc_st_swap32(s32, d32)\
+	__asm__ volatile("sta %0,[%1]0x88" : : "r"(s32), "r"(d32))
+#define	ld_swap32(s, d) sparc_ld_swap32(s, d)
+#define	st_swap32(s, d) sparc_st_swap32(s, d)
+#define	ld_swap64(s, d) sparc_ld_swap64(s, d)
+#define	st_swap64(s, d) sparc_st_swap64(s, d)
+#endif /* GCC || Sun Studio C > 5.9 */
+#endif /* sparc */
+
+/* GCC fallback */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap32)
+#define	ld_swap32(s, d) (d = __builtin_bswap32(*(s)))
+#define	st_swap32(s, d) (*(d) = __builtin_bswap32(s))
+#endif /* GCC4/PGIC && !swap32 */
+#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap64)
+#define	ld_swap64(s, d) (d = __builtin_bswap64(*(s)))
+#define	st_swap64(s, d) (*(d) = __builtin_bswap64(s))
+#endif /* GCC4/PGIC && !swap64 */
+
+/* generic fallback */
+#if !defined(ld_swap32)
+#define	ld_swap32(s, d)							\
+	(d = (*(s) >> 24) | (*(s) >> 8 & 0xff00) |			\
+	(*(s) << 8 & 0xff0000) | (*(s) << 24))
+#define	st_swap32(s, d)							\
+	(*(d) = ((s) >> 24) | ((s) >> 8 & 0xff00) |			\
+	((s) << 8 & 0xff0000) | ((s) << 24))
+#endif
+#if !defined(ld_swap64)
+#define	ld_swap64(s, d)							\
+	(d = (*(s) >> 56) | (*(s) >> 40 & 0xff00) |			\
+	(*(s) >> 24 & 0xff0000) | (*(s) >> 8 & 0xff000000) |		\
+	(*(s) & 0xff000000) << 8 | (*(s) & 0xff0000) << 24 |		\
+	(*(s) & 0xff00) << 40 | *(s) << 56)
+#define	st_swap64(s, d)							\
+	(*(d) = ((s) >> 56) | ((s) >> 40 & 0xff00) |			\
+	((s) >> 24 & 0xff0000) | ((s) >> 8 & 0xff000000) |		\
+	((s) & 0xff000000) << 8 | ((s) & 0xff0000) << 24 |		\
+	((s) & 0xff00) << 40 | (s) << 56)
+#endif
+
+#endif /* MACHINE_IS_BIG_ENDIAN */
+
+
+#if defined(MACHINE_IS_LITTLE_ENDIAN)
+/* replace swaps with simple assignments on little endian systems */
+#undef	ld_swap32
+#undef	st_swap32
+#define	ld_swap32(s, d) (d = *(s))
+#define	st_swap32(s, d) (*(d) = s)
+#undef	ld_swap64
+#undef	st_swap64
+#define	ld_swap64(s, d) (d = *(s))
+#define	st_swap64(s, d) (*(d) = s)
+#endif /* MACHINE_IS_LITTLE_ENDIAN */
+
+#endif /* _CRYPTO_EDONR_BYTEORDER_H */
diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c
index 792ca8825..dbe008190 100644
--- a/module/icp/algs/sha2/sha2.c
+++ b/module/icp/algs/sha2/sha2.c
@@ -38,7 +38,7 @@
 
 #include <sys/zfs_context.h>
 #define	_SHA2_IMPL
-#include <sha2/sha2.h>
+#include <sys/sha2.h>
 #include <sha2/sha2_consts.h>
 
 #define	_RESTRICT_KYWD
@@ -47,18 +47,37 @@
 #include <sys/byteorder.h>
 #define	HAVE_HTONL
 #endif
+#include <sys/isa_defs.h>	/* for _ILP32 */
 
 static void Encode(uint8_t *, uint32_t *, size_t);
+static void Encode64(uint8_t *, uint64_t *, size_t);
 
 #if	defined(__amd64)
+#define	SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1)
 #define	SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1)
+
+void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
 void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+
 #else
 static void SHA256Transform(SHA2_CTX *, const uint8_t *);
+static void SHA512Transform(SHA2_CTX *, const uint8_t *);
 #endif	/* __amd64 */
 
 static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
 
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stacks are enforced (like 32-bit kernel builds), insert compiler memory
+ * barriers to reduce stack frame size. This can reduce the SHA512Transform()
+ * stack frame usage from 3k to <1k on ARM32, for example.
+ */
+#if defined(_ILP32) || defined(__powerpc)	/* small stack */
+#define	SMALL_STACK_MEMORY_BARRIER	asm volatile("": : :"memory");
+#else
+#define	SMALL_STACK_MEMORY_BARRIER
+#endif
+
 /* Ch and Maj are the basic SHA2 functions. */
 #define	Ch(b, c, d)	(((b) & (c)) ^ ((~b) & (d)))
 #define	Maj(b, c, d)	(((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d)))
@@ -82,6 +101,18 @@ static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
 	T2 = BIGSIGMA0_256(a) + Maj(a, b, c);				\
 	h = T1 + T2
 
+/* SHA384/512 Functions */
+#define	BIGSIGMA0(x)	(ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39))
+#define	BIGSIGMA1(x)	(ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41))
+#define	SIGMA0(x)	(ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7))
+#define	SIGMA1(x)	(ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6))
+#define	SHA512ROUND(a, b, c, d, e, f, g, h, i, w)			\
+	T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w;	\
+	d += T1;							\
+	T2 = BIGSIGMA0(a) + Maj(a, b, c);				\
+	h = T1 + T2;							\
+	SMALL_STACK_MEMORY_BARRIER;
+
 /*
  * sparc optimization:
  *
@@ -130,6 +161,33 @@ SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk)
 	uint32_t w8, w9, w10, w11, w12, w13, w14, w15;
 	uint32_t T1, T2;
 
+#if	defined(__sparc)
+	static const uint32_t sha256_consts[] = {
+		SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2,
+		SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5,
+		SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8,
+		SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11,
+		SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14,
+		SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17,
+		SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20,
+		SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23,
+		SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26,
+		SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29,
+		SHA256_CONST_30, SHA256_CONST_31, SHA256_CONST_32,
+		SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35,
+		SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38,
+		SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41,
+		SHA256_CONST_42, SHA256_CONST_43, SHA256_CONST_44,
+		SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47,
+		SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50,
+		SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53,
+		SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56,
+		SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59,
+		SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62,
+		SHA256_CONST_63
+	};
+#endif	/* __sparc */
+
 	if ((uintptr_t)blk & 0x3) {		/* not 4-byte aligned? */
 		bcopy(blk, ctx->buf_un.buf32,  sizeof (ctx->buf_un.buf32));
 		blk = (uint8_t *)ctx->buf_un.buf32;
@@ -292,6 +350,256 @@ SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk)
 	ctx->state.s32[6] += g;
 	ctx->state.s32[7] += h;
 }
+
+/*
+ * SHA384 and SHA512 Transform: mix the sixteen 64-bit words at blk into ctx.
+ */
+static void
+SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk)
+{
+	/* Working variables a..h start as a copy of the chaining state. */
+	uint64_t a = ctx->state.s64[0];
+	uint64_t b = ctx->state.s64[1];
+	uint64_t c = ctx->state.s64[2];
+	uint64_t d = ctx->state.s64[3];
+	uint64_t e = ctx->state.s64[4];
+	uint64_t f = ctx->state.s64[5];
+	uint64_t g = ctx->state.s64[6];
+	uint64_t h = ctx->state.s64[7];
+
+	uint64_t w0, w1, w2, w3, w4, w5, w6, w7;
+	uint64_t w8, w9, w10, w11, w12, w13, w14, w15;
+	uint64_t T1, T2;
+
+#if	defined(__sparc)
+	static const uint64_t sha512_consts[] = {
+		SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2,
+		SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5,
+		SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8,
+		SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11,
+		SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14,
+		SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17,
+		SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20,
+		SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23,
+		SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26,
+		SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29,
+		SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32,
+		SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35,
+		SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38,
+		SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41,
+		SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44,
+		SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47,
+		SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50,
+		SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53,
+		SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56,
+		SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59,
+		SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62,
+		SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65,
+		SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68,
+		SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71,
+		SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74,
+		SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77,
+		SHA512_CONST_78, SHA512_CONST_79
+	};
+#endif	/* __sparc */
+
+	/* Misaligned input is first copied into ctx->buf_un.buf64. */
+	if ((uintptr_t)blk & 0x7) {		/* not 8-byte aligned? */
+		bcopy(blk, ctx->buf_un.buf64,  sizeof (ctx->buf_un.buf64));
+		blk = (uint8_t *)ctx->buf_un.buf64;
+	}
+
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w0 =  LOAD_BIG_64(blk + 8 * 0);
+	SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w1 =  LOAD_BIG_64(blk + 8 * 1);
+	SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w2 =  LOAD_BIG_64(blk + 8 * 2);
+	SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w3 =  LOAD_BIG_64(blk + 8 * 3);
+	SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w4 =  LOAD_BIG_64(blk + 8 * 4);
+	SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w5 =  LOAD_BIG_64(blk + 8 * 5);
+	SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w6 =  LOAD_BIG_64(blk + 8 * 6);
+	SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w7 =  LOAD_BIG_64(blk + 8 * 7);
+	SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w8 =  LOAD_BIG_64(blk + 8 * 8);
+	SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w9 =  LOAD_BIG_64(blk + 8 * 9);
+	SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w10 =  LOAD_BIG_64(blk + 8 * 10);
+	SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w11 =  LOAD_BIG_64(blk + 8 * 11);
+	SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w12 =  LOAD_BIG_64(blk + 8 * 12);
+	SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w13 =  LOAD_BIG_64(blk + 8 * 13);
+	SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w14 =  LOAD_BIG_64(blk + 8 * 14);
+	SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14);
+	/* LINTED E_BAD_PTR_CAST_ALIGN */
+	w15 =  LOAD_BIG_64(blk + 8 * 15);
+	SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15);
+	/* Rounds 16-79 update w0..w15 in place as a sliding 16-word window. */
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15);
+
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15);
+
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15);
+
+	w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0);
+	w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1);
+	w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2);
+	w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3);
+	w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4);
+	w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5);
+	w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6);
+	w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7);
+	w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8;
+	SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8);
+	w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9;
+	SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9);
+	w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10;
+	SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10);
+	w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11;
+	SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11);
+	w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12;
+	SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12);
+	w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13;
+	SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13);
+	w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14;
+	SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14);
+	w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15;
+	SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15);
+	/* Feed-forward: add the working variables back into the state. */
+	ctx->state.s64[0] += a;
+	ctx->state.s64[1] += b;
+	ctx->state.s64[2] += c;
+	ctx->state.s64[3] += d;
+	ctx->state.s64[4] += e;
+	ctx->state.s64[5] += f;
+	ctx->state.s64[6] += g;
+	ctx->state.s64[7] += h;
+
+}
 #endif	/* !__amd64 */
 
 
@@ -311,14 +619,56 @@ Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input,
 {
 	size_t		i, j;
 
-	for (i = 0, j = 0; j < len; i++, j += 4) {
-		output[j]	= (input[i] >> 24) & 0xff;
-		output[j + 1]	= (input[i] >> 16) & 0xff;
-		output[j + 2]	= (input[i] >>  8) & 0xff;
-		output[j + 3]	= input[i] & 0xff;
+#if	defined(__sparc)
+	if (IS_P2ALIGNED(output, sizeof (uint32_t))) {
+		for (i = 0, j = 0; j < len; i++, j += 4) {
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			*((uint32_t *)(output + j)) = input[i];
+		}
+	} else {
+#endif	/* little endian -- will work on big endian, but slowly */
+		for (i = 0, j = 0; j < len; i++, j += 4) {
+			output[j]	= (input[i] >> 24) & 0xff;
+			output[j + 1]	= (input[i] >> 16) & 0xff;
+			output[j + 2]	= (input[i] >>  8) & 0xff;
+			output[j + 3]	= input[i] & 0xff;
+		}
+#if	defined(__sparc)
 	}
+#endif
 }
 
+static void
+Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input,
+    size_t len)
+{
+	size_t		i, j;
+	/* Copy len bytes' worth of 64-bit words from input[] to output[]. */
+#if	defined(__sparc)
+	if (IS_P2ALIGNED(output, sizeof (uint64_t))) {
+		for (i = 0, j = 0; j < len; i++, j += 8) {
+			/* LINTED E_BAD_PTR_CAST_ALIGN */
+			*((uint64_t *)(output + j)) = input[i];
+		}
+	} else {
+#endif	/* little endian -- will work on big endian, but slowly */
+		for (i = 0, j = 0; j < len; i++, j += 8) {
+			/* most-significant byte first, one byte at a time */
+			output[j]	= (input[i] >> 56) & 0xff;
+			output[j + 1]	= (input[i] >> 48) & 0xff;
+			output[j + 2]	= (input[i] >> 40) & 0xff;
+			output[j + 3]	= (input[i] >> 32) & 0xff;
+			output[j + 4]	= (input[i] >> 24) & 0xff;
+			output[j + 5]	= (input[i] >> 16) & 0xff;
+			output[j + 6]	= (input[i] >>  8) & 0xff;
+			output[j + 7]	= input[i] & 0xff;
+		}
+#if	defined(__sparc)
+	}
+#endif
+}
+
+
 void
 SHA2Init(uint64_t mech, SHA2_CTX *ctx)
 {
@@ -336,22 +686,86 @@ SHA2Init(uint64_t mech, SHA2_CTX *ctx)
 		ctx->state.s32[6] = 0x1f83d9abU;
 		ctx->state.s32[7] = 0x5be0cd19U;
 		break;
+	case SHA384_MECH_INFO_TYPE:
+	case SHA384_HMAC_MECH_INFO_TYPE:
+	case SHA384_HMAC_GEN_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL;
+		ctx->state.s64[1] = 0x629a292a367cd507ULL;
+		ctx->state.s64[2] = 0x9159015a3070dd17ULL;
+		ctx->state.s64[3] = 0x152fecd8f70e5939ULL;
+		ctx->state.s64[4] = 0x67332667ffc00b31ULL;
+		ctx->state.s64[5] = 0x8eb44a8768581511ULL;
+		ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL;
+		ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL;
+		break;
+	case SHA512_MECH_INFO_TYPE:
+	case SHA512_HMAC_MECH_INFO_TYPE:
+	case SHA512_HMAC_GEN_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0x6a09e667f3bcc908ULL;
+		ctx->state.s64[1] = 0xbb67ae8584caa73bULL;
+		ctx->state.s64[2] = 0x3c6ef372fe94f82bULL;
+		ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL;
+		ctx->state.s64[4] = 0x510e527fade682d1ULL;
+		ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL;
+		ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL;
+		ctx->state.s64[7] = 0x5be0cd19137e2179ULL;
+		break;
+	case SHA512_224_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0x8C3D37C819544DA2ULL;
+		ctx->state.s64[1] = 0x73E1996689DCD4D6ULL;
+		ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL;
+		ctx->state.s64[3] = 0x679DD514582F9FCFULL;
+		ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL;
+		ctx->state.s64[5] = 0x77E36F7304C48942ULL;
+		ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL;
+		ctx->state.s64[7] = 0x1112E6AD91D692A1ULL;
+		break;
+	case SHA512_256_MECH_INFO_TYPE:
+		ctx->state.s64[0] = 0x22312194FC2BF72CULL;
+		ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL;
+		ctx->state.s64[2] = 0x2393B86B6F53B151ULL;
+		ctx->state.s64[3] = 0x963877195940EABDULL;
+		ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL;
+		ctx->state.s64[5] = 0xBE5E1E2553863992ULL;
+		ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL;
+		ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL;
+		break;
+#ifdef _KERNEL
 	default:
 		cmn_err(CE_PANIC,
 		    "sha2_init: failed to find a supported algorithm: 0x%x",
 		    (uint32_t)mech);
+
+#endif /* _KERNEL */
 	}
 
 	ctx->algotype = (uint32_t)mech;
 	ctx->count.c64[0] = ctx->count.c64[1] = 0;
 }
 
+#ifndef _KERNEL
+
+/* Userland-only convenience wrappers that forward to SHA2Init(). */
 void
 SHA256Init(SHA256_CTX *ctx)
 {
 	SHA2Init(SHA256, ctx);
 }
 
+void
+SHA384Init(SHA384_CTX *ctx)
+{
+	SHA2Init(SHA384, ctx);
+}
+
+void
+SHA512Init(SHA512_CTX *ctx)
+{
+	SHA2Init(SHA512, ctx);
+}
+
+#endif /* _KERNEL */
+
 /*
  * SHA2Update()
  *
@@ -422,6 +836,8 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
 			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
 			if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE)
 				SHA256Transform(ctx, ctx->buf_un.buf8);
+			else
+				SHA512Transform(ctx, ctx->buf_un.buf8);
 
 			i = buf_len;
 		}
@@ -431,6 +847,10 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
 			for (; i + buf_limit - 1 < input_len; i += buf_limit) {
 				SHA256Transform(ctx, &input[i]);
 			}
+		} else {
+			for (; i + buf_limit - 1 < input_len; i += buf_limit) {
+				SHA512Transform(ctx, &input[i]);
+			}
 		}
 
 #else
@@ -441,6 +861,13 @@ SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len)
 				    block_count);
 				i += block_count << 6;
 			}
+		} else {
+			block_count = (input_len - i) >> 7;
+			if (block_count > 0) {
+				SHA512TransformBlocks(ctx, &input[i],
+				    block_count);
+				i += block_count << 7;
+			}
 		}
 #endif	/* !__amd64 */
 
@@ -479,6 +906,7 @@ void
 SHA2Final(void *digest, SHA2_CTX *ctx)
 {
 	uint8_t		bitcount_be[sizeof (ctx->count.c32)];
+	uint8_t		bitcount_be64[sizeof (ctx->count.c64)];
 	uint32_t	index;
 	uint32_t	algotype = ctx->algotype;
 
@@ -488,8 +916,45 @@ SHA2Final(void *digest, SHA2_CTX *ctx)
 		SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index);
 		SHA2Update(ctx, bitcount_be, sizeof (bitcount_be));
 		Encode(digest, ctx->state.s32, sizeof (ctx->state.s32));
+	} else {
+		index  = (ctx->count.c64[1] >> 3) & 0x7f;
+		Encode64(bitcount_be64, ctx->count.c64,
+		    sizeof (bitcount_be64));
+		SHA2Update(ctx, PADDING, ((index < 112) ? 112 : 240) - index);
+		SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64));
+		if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) {
+			ctx->state.s64[6] = ctx->state.s64[7] = 0;
+			Encode64(digest, ctx->state.s64,
+			    sizeof (uint64_t) * 6);
+		} else if (algotype == SHA512_224_MECH_INFO_TYPE) {
+			uint8_t last[sizeof (uint64_t)];
+			/*
+			 * Since SHA-512/224 doesn't align well to 64-bit
+			 * boundaries, we must do the encoding in three steps:
+			 * 1) encode the three 64-bit words that fit neatly
+			 * 2) encode the last 64-bit word to a temp buffer
+			 * 3) chop out the lower 32-bits from the temp buffer
+			 *    and append them to the digest
+			 */
+			Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3);
+			Encode64(last, &ctx->state.s64[3], sizeof (uint64_t));
+			bcopy(last, (uint8_t *)digest + 24, 4);
+		} else if (algotype == SHA512_256_MECH_INFO_TYPE) {
+			Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4);
+		} else {
+			Encode64(digest, ctx->state.s64,
+			    sizeof (ctx->state.s64));
+		}
 	}
 
 	/* zeroize sensitive information */
 	bzero(ctx, sizeof (*ctx));
 }
+
+
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(SHA2Init);
+EXPORT_SYMBOL(SHA2Update);
+EXPORT_SYMBOL(SHA2Final);
+#endif
diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE b/module/icp/algs/skein/THIRDPARTYLICENSE
new file mode 100644
index 000000000..b7434fd17
--- /dev/null
+++ b/module/icp/algs/skein/THIRDPARTYLICENSE
@@ -0,0 +1,3 @@
+Implementation of the Skein hash function.
+Source code author: Doug Whiting, 2008.
+This algorithm and source code is released to the public domain.
diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE.descrip b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
new file mode 100644
index 000000000..0ae89cfdf
--- /dev/null
+++ b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip
@@ -0,0 +1 @@
+LICENSE TERMS OF SKEIN HASH ALGORITHM IMPLEMENTATION
diff --git a/module/icp/algs/skein/skein.c b/module/icp/algs/skein/skein.c
new file mode 100644
index 000000000..0981eee08
--- /dev/null
+++ b/module/icp/algs/skein/skein.c
@@ -0,0 +1,921 @@
+/*
+ * Implementation of the Skein hash function.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#define	SKEIN_PORT_CODE		/* instantiate any code in skein_port.h */
+
+#include <sys/types.h>
+#include <sys/note.h>
+#include <sys/skein.h>		/* get the Skein API definitions   */
+#include "skein_impl.h"		/* get internal definitions */
+
+/* External function to process blkCnt (nonzero) full block(s) of data. */
+void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd);
+void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd);
+void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd);
+
+/* 256-bit Skein */
+/* Skein-256: init ctx for straight hashing with a hashBitLen-bit output. */
+int
+Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen)
+{
+	union {
+		uint8_t b[SKEIN_256_STATE_BYTES];
+		uint64_t w[SKEIN_256_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+
+	switch (hashBitLen) {	/* use pre-computed values, where available */
+#ifndef	SKEIN_NO_PRECOMP
+	case 256:
+		bcopy(SKEIN_256_IV_256, ctx->X, sizeof (ctx->X));
+		break;
+	case 224:
+		bcopy(SKEIN_256_IV_224, ctx->X, sizeof (ctx->X));
+		break;
+	case 160:
+		bcopy(SKEIN_256_IV_160, ctx->X, sizeof (ctx->X));
+		break;
+	case 128:
+		bcopy(SKEIN_256_IV_128, ctx->X, sizeof (ctx->X));
+		break;
+#endif
+	default:
+		/*
+		 * No precomputed IV for this length: derive the chaining
+		 * values by processing one CFG_FINAL block built from the
+		 * config words (schema/version, output bits, tree info).
+		 */
+		/* set tweaks: T0=0; T1=CFG | FINAL */
+		Skein_Start_New_Type(ctx, CFG_FINAL);
+
+		/* set the schema, version */
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+		/* hash result length in bits */
+		cfg.w[1] = Skein_Swap64(hashBitLen);
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		/* zero pad config block */
+		bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+		/* compute the initial chaining values from config block */
+		/* zero the chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
+	/*
+	 * ctx->X now holds the initial chaining values for hashBitLen (from
+	 * a precomputed IV or the config block above). Start the MSG type
+	 * so that the data message portion of the hash can be processed.
+	 */
+	Skein_Start_New_Type(ctx, MSG);	/* T0=0, T1= MSG type */
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Skein-256: init ctx for a MAC and/or tree hash. With keyBytes == 0 and
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL this matches Skein_256_Init().
+ * key may be NULL only when keyBytes == 0; returns SKEIN_SUCCESS.
+ */
+int
+Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+    const uint8_t *key, size_t keyBytes)
+{
+	union {
+		uint8_t b[SKEIN_256_STATE_BYTES];
+		uint64_t w[SKEIN_256_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+	/* compute the initial chaining values ctx->X[], based on key */
+	if (keyBytes == 0) {	/* is there a key? */
+		/* no key: use all zeroes as key for config block */
+		bzero(ctx->X, sizeof (ctx->X));
+	} else {		/* here to pre-process a key */
+
+		Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+		/* do a mini-Init right here */
+		/* set output hash bit count = state size */
+		ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+		/* set tweaks: T0 = 0; T1 = KEY type */
+		Skein_Start_New_Type(ctx, KEY);
+		/* zero the initial chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		/* hash the key */
+		(void) Skein_256_Update(ctx, key, keyBytes);
+		/* put result into cfg.b[] */
+		(void) Skein_256_Final_Pad(ctx, cfg.b);
+		/* copy into ctx->X[]; bound by the destination, not cfg.b */
+		bcopy(cfg.b, ctx->X, sizeof (ctx->X));
+#if	SKEIN_NEED_SWAP
+		{
+			uint_t i;
+			/* convert key bytes to context words */
+			for (i = 0; i < SKEIN_256_STATE_WORDS; i++)
+				ctx->X[i] = Skein_Swap64(ctx->X[i]);
+		}
+#endif
+	}
+	/*
+	 * build/process the config block, type == CONFIG (could be
+	 * precomputed for each key)
+	 */
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+	Skein_Start_New_Type(ctx, CFG_FINAL);
+
+	bzero(&cfg.w, sizeof (cfg.w));	/* pre-pad cfg.w[] with zeroes */
+	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+	cfg.w[1] = Skein_Swap64(hashBitLen);	/* hash result length in bits */
+	/* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+	cfg.w[2] = Skein_Swap64(treeInfo);
+
+	Skein_Show_Key(256, &ctx->h, key, keyBytes);
+
+	/* compute the initial chaining values from config block */
+	Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+	/* The chaining vars ctx->X are now initialized */
+	/* Set up to process the data message portion of the hash (default) */
+	ctx->h.bCnt = 0;	/* buffer b[] starts out empty */
+	Skein_Start_New_Type(ctx, MSG);
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Skein-256: absorb msgByteCnt bytes at msg, buffering the tail in b[]. */
+int
+Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+	size_t n;
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* more than one block's worth is pending: process full blocks */
+	if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) {
+		/* finish up any buffered message data */
+		if (ctx->h.bCnt) {
+			/* # bytes free in buffer b[] */
+			n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt;
+			if (n) {
+				/* check on our logic here */
+				Skein_assert(n < msgByteCnt);
+				bcopy(msg, &ctx->b[ctx->h.bCnt], n);
+				msgByteCnt -= n;
+				msg += n;
+				ctx->h.bCnt += n;
+			}
+			Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES);
+			Skein_256_Process_Block(ctx, ctx->b, 1,
+			    SKEIN_256_BLOCK_BYTES);
+			ctx->h.bCnt = 0;
+		}
+		/*
+		 * now process any remaining full blocks, directly from input
+		 * message data
+		 */
+		if (msgByteCnt > SKEIN_256_BLOCK_BYTES) {
+			/* full blocks, leaving at least one byte for b[] */
+			n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES;
+			Skein_256_Process_Block(ctx, msg, n,
+			    SKEIN_256_BLOCK_BYTES);
+			msgByteCnt -= n * SKEIN_256_BLOCK_BYTES;
+			msg += n * SKEIN_256_BLOCK_BYTES;
+		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
+
+	/* copy any remaining source message data bytes into b[] */
+	if (msgByteCnt) {
+		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES);
+		bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+		ctx->h.bCnt += msgByteCnt;
+	}
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Skein-256: process the buffered last block and write the hash to hashVal. */
+int
+Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t i, n, byteCnt;
+	uint64_t X[SKEIN_256_STATE_WORDS];
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;	/* tag as the final block */
+	/* zero pad b[] if necessary */
+	if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES)
+		bzero(&ctx->b[ctx->h.bCnt],
+		    SKEIN_256_BLOCK_BYTES - ctx->h.bCnt);
+
+	/* process the final block, passing bCnt as its byte count */
+	Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+	/* now output the result */
+	/* output byte count: hashBitLen rounded up to whole bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+	/* run Threefish in "counter mode" to generate output */
+	/* zero out b[], so it can hold the counter */
+	bzero(ctx->b, sizeof (ctx->b));
+	/* keep a local copy of counter mode "key" */
+	bcopy(ctx->X, X, sizeof (X));
+	for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) {
+		/* build the counter block */
+		uint64_t tmp = Skein_Swap64((uint64_t)i);
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		/* run "counter mode" */
+		Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* output bytes still owed, capped at one block per pass */
+		n = byteCnt - i * SKEIN_256_BLOCK_BYTES;
+		if (n >= SKEIN_256_BLOCK_BYTES)
+			n = SKEIN_256_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES,
+		    ctx->X, n);	/* "output" the ctr mode bytes */
+		Skein_Show_Final(256, &ctx->h, n,
+		    hashVal + i * SKEIN_256_BLOCK_BYTES);
+		/* restore the counter mode key for next time */
+		bcopy(X, ctx->X, sizeof (X));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* 512-bit Skein */
+
+/* Skein-512: init ctx for straight hashing with a hashBitLen-bit output. */
+int
+Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen)
+{
+	union {
+		uint8_t b[SKEIN_512_STATE_BYTES];
+		uint64_t w[SKEIN_512_STATE_WORDS];
+	} cfg;			/* config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+
+	switch (hashBitLen) {	/* use pre-computed values, where available */
+#ifndef	SKEIN_NO_PRECOMP
+	case 512:
+		bcopy(SKEIN_512_IV_512, ctx->X, sizeof (ctx->X));
+		break;
+	case 384:
+		bcopy(SKEIN_512_IV_384, ctx->X, sizeof (ctx->X));
+		break;
+	case 256:
+		bcopy(SKEIN_512_IV_256, ctx->X, sizeof (ctx->X));
+		break;
+	case 224:
+		bcopy(SKEIN_512_IV_224, ctx->X, sizeof (ctx->X));
+		break;
+#endif
+	default:
+		/*
+		 * No precomputed IV for this length: derive the chaining
+		 * values by processing one CFG_FINAL block built from the
+		 * config words (schema/version, output bits, tree info).
+		 */
+		/* set tweaks: T0=0; T1=CFG | FINAL */
+		Skein_Start_New_Type(ctx, CFG_FINAL);
+
+		/* set the schema, version */
+		cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+		/* hash result length in bits */
+		cfg.w[1] = Skein_Swap64(hashBitLen);
+		cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		/* zero pad config block */
+		bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0]));
+
+		/* compute the initial chaining values from config block */
+		/* zero the chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
+
+	/*
+	 * ctx->X now holds the initial chaining values for hashBitLen (from
+	 * a precomputed IV or the config block above). Start the MSG type
+	 * so that the data message portion of the hash can be processed.
+	 */
+	Skein_Start_New_Type(ctx, MSG);	/* T0=0, T1= MSG type */
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Set up ctx for keyed (MAC) and/or tree hashing with Skein-512.
+ * [identical to Skein_512_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+    const uint8_t *key, size_t keyBytes)
+{
+	union {
+		uint8_t b[SKEIN_512_STATE_BYTES];
+		uint64_t w[SKEIN_512_STATE_WORDS];
+	} cfg;			/* key digest first, config block afterwards */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+	/* derive the starting chaining values ctx->X[] from the key */
+	if (keyBytes == 0) {
+		/* unkeyed: the config block is processed with a zero state */
+		bzero(ctx->X, sizeof (ctx->X));
+	} else {
+		Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+		/*
+		 * Keyed: run a small hash of its own over the key, as a
+		 * KEY-type message starting from an all-zero state, with
+		 * the output size set to the full state size.
+		 */
+		ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+		Skein_Start_New_Type(ctx, KEY);
+		bzero(ctx->X, sizeof (ctx->X));
+		(void) Skein_512_Update(ctx, key, keyBytes);
+		(void) Skein_512_Final_Pad(ctx, cfg.b);
+		/*
+		 * The key digest becomes the chaining state.  Copy no more
+		 * than ctx->X[] can hold: cfg.b[] is only asserted to be at
+		 * least that large, not exactly that large.
+		 */
+		bcopy(cfg.b, ctx->X, sizeof (ctx->X));
+#if	SKEIN_NEED_SWAP
+		{
+			uint_t i;
+			/* convert key bytes to context words */
+			for (i = 0; i < SKEIN_512_STATE_WORDS; i++)
+				ctx->X[i] = Skein_Swap64(ctx->X[i]);
+		}
+#endif
+	}
+	/*
+	 * Build and process the config block (type CFG | FINAL) on top of
+	 * the state chosen above.
+	 */
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+	Skein_Start_New_Type(ctx, CFG_FINAL);
+
+	bzero(&cfg.w, sizeof (cfg.w));	/* unused config words stay zero */
+	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+	cfg.w[1] = Skein_Swap64(hashBitLen);	/* hash result length in bits */
+	/* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+	cfg.w[2] = Skein_Swap64(treeInfo);
+
+	Skein_Show_Key(512, &ctx->h, key, keyBytes);
+	Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+	/* ctx->X[] is ready: empty the buffer and expect message data next */
+	ctx->h.bCnt = 0;
+	Skein_Start_New_Type(ctx, MSG);
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Absorb msgByteCnt bytes of msg into the Skein-512 state held in ctx. */
+int
+Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+	size_t room, nBlks;
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+	/*
+	 * Compress only when more than one block's worth of data is on hand,
+	 * so the final (possibly full) block always remains buffered in b[].
+	 */
+	if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) {
+		if (ctx->h.bCnt != 0) {
+			/* top up the partially filled b[] and compress it */
+			room = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt;
+			if (room != 0) {
+				Skein_assert(room < msgByteCnt);
+				bcopy(msg, &ctx->b[ctx->h.bCnt], room);
+				ctx->h.bCnt += room;
+				msg += room;
+				msgByteCnt -= room;
+			}
+			Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES);
+			Skein_512_Process_Block(ctx, ctx->b, 1,
+			    SKEIN_512_BLOCK_BYTES);
+			ctx->h.bCnt = 0;
+		}
+		if (msgByteCnt > SKEIN_512_BLOCK_BYTES) {
+			/*
+			 * compress whole blocks straight from the caller's
+			 * buffer, always holding back at least one byte
+			 */
+			nBlks = (msgByteCnt - 1) / SKEIN_512_BLOCK_BYTES;
+			Skein_512_Process_Block(ctx, msg, nBlks,
+			    SKEIN_512_BLOCK_BYTES);
+			msg += nBlks * SKEIN_512_BLOCK_BYTES;
+			msgByteCnt -= nBlks * SKEIN_512_BLOCK_BYTES;
+		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
+
+	/* stash whatever is left in b[] for a later Update or Final call */
+	if (msgByteCnt != 0) {
+		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES);
+		bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+		ctx->h.bCnt += msgByteCnt;
+	}
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Pad and absorb the final block, then emit the digest into hashVal. */
+int
+Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t ctr, off, chunk, outBytes;
+	uint64_t savedX[SKEIN_512_STATE_WORDS];
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* zero-fill the unused tail of b[] and process it as FINAL */
+	if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES)
+		bzero(&ctx->b[ctx->h.bCnt],
+		    SKEIN_512_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+	Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+	/* digest length in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * Output stage: for each block of output, process an 8-byte
+	 * counter held in b[] (remainder zero) as an OUT_FINAL-type block.
+	 * Every iteration starts again from the same chaining values, which
+	 * are kept in savedX[].
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN_512_BLOCK_BYTES) {
+		uint64_t ctrWord = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&ctrWord, ctx->b, sizeof (ctrWord));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+
+		/* hand out at most one block's worth of bytes per pass */
+		chunk = outBytes - off;
+		if (chunk > SKEIN_512_BLOCK_BYTES)
+			chunk = SKEIN_512_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, chunk);
+		Skein_Show_Final(512, &ctx->h, chunk, hashVal + off);
+
+		/* rewind the chaining values for the next counter value */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* 1024-bit Skein */
+
+/*
+ * Set up ctx for plain (unkeyed, sequential) Skein-1024 hashing.
+ */
+int
+Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen)
+{
+	union {
+		uint8_t b[SKEIN1024_STATE_BYTES];
+		uint64_t w[SKEIN1024_STATE_WORDS];
+	} cfgBlk;		/* scratch space for the config block */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	/* remember how many output bits were requested */
+	ctx->h.hashBitLen = hashBitLen;
+
+	/* prefer a precomputed IV when one exists for this output size */
+	switch (hashBitLen) {
+#ifndef	SKEIN_NO_PRECOMP
+	case 384:
+		bcopy(SKEIN1024_IV_384, ctx->X, sizeof (ctx->X));
+		break;
+	case 512:
+		bcopy(SKEIN1024_IV_512, ctx->X, sizeof (ctx->X));
+		break;
+	case 1024:
+		bcopy(SKEIN1024_IV_1024, ctx->X, sizeof (ctx->X));
+		break;
+#endif
+	default:
+		/*
+		 * No precomputed IV for this length: derive the chaining
+		 * values by running a CONFIG-type block through the
+		 * block function, starting from an all-zero state.
+		 */
+		/* tweak for the config block: T0 = 0, T1 = CFG | FINAL */
+		Skein_Start_New_Type(ctx, CFG_FINAL);
+
+		/* config block: schema/version, output bits, tree info */
+		bzero(&cfgBlk, sizeof (cfgBlk));
+		cfgBlk.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+		cfgBlk.w[1] = Skein_Swap64(hashBitLen);
+		cfgBlk.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL);
+		/* start from all-zero chaining variables */
+		bzero(ctx->X, sizeof (ctx->X));
+		Skein1024_Process_Block(ctx, cfgBlk.b, 1, SKEIN_CFG_STR_LEN);
+		break;
+	}
+
+	/*
+	 * ctx->X now holds the initial chaining values for hashBitLen;
+	 * switch the tweak to MSG type (T0 = 0) so that message data can
+	 * be fed in next.
+	 */
+	Skein_Start_New_Type(ctx, MSG);
+
+	return (SKEIN_SUCCESS);
+}
+
+/*
+ * Set up ctx for keyed (MAC) and/or tree hashing with Skein-1024.
+ * [identical to Skein1024_Init() when keyBytes == 0 &&
+ * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL]
+ */
+int
+Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo,
+    const uint8_t *key, size_t keyBytes)
+{
+	union {
+		uint8_t b[SKEIN1024_STATE_BYTES];
+		uint64_t w[SKEIN1024_STATE_WORDS];
+	} cfg;			/* key digest first, config block afterwards */
+
+	Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN);
+	Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL);
+
+	/* derive the starting chaining values ctx->X[] from the key */
+	if (keyBytes == 0) {
+		/* unkeyed: the config block is processed with a zero state */
+		bzero(ctx->X, sizeof (ctx->X));
+	} else {
+		Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X));
+		/*
+		 * Keyed: run a small hash of its own over the key, as a
+		 * KEY-type message starting from an all-zero state, with
+		 * the output size set to the full state size.
+		 */
+		ctx->h.hashBitLen = 8 * sizeof (ctx->X);
+		Skein_Start_New_Type(ctx, KEY);
+		bzero(ctx->X, sizeof (ctx->X));
+		(void) Skein1024_Update(ctx, key, keyBytes);
+		(void) Skein1024_Final_Pad(ctx, cfg.b);
+		/*
+		 * The key digest becomes the chaining state.  Copy no more
+		 * than ctx->X[] can hold: cfg.b[] is only asserted to be at
+		 * least that large, not exactly that large.
+		 */
+		bcopy(cfg.b, ctx->X, sizeof (ctx->X));
+#if	SKEIN_NEED_SWAP
+		{
+			uint_t i;
+			/* convert key bytes to context words */
+			for (i = 0; i < SKEIN1024_STATE_WORDS; i++)
+				ctx->X[i] = Skein_Swap64(ctx->X[i]);
+		}
+#endif
+	}
+	/*
+	 * Build and process the config block (type CFG | FINAL) on top of
+	 * the state chosen above.
+	 */
+	ctx->h.hashBitLen = hashBitLen;	/* output hash bit count */
+	Skein_Start_New_Type(ctx, CFG_FINAL);
+
+	bzero(&cfg.w, sizeof (cfg.w));	/* unused config words stay zero */
+	cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER);
+	cfg.w[1] = Skein_Swap64(hashBitLen);	/* hash result length in bits */
+	/* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */
+	cfg.w[2] = Skein_Swap64(treeInfo);
+
+	Skein_Show_Key(1024, &ctx->h, key, keyBytes);
+	Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN);
+
+	/* ctx->X[] is ready: empty the buffer and expect message data next */
+	ctx->h.bCnt = 0;
+	Skein_Start_New_Type(ctx, MSG);
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Absorb msgByteCnt bytes of msg into the Skein-1024 state held in ctx. */
+int
+Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt)
+{
+	size_t room, nBlks;
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/*
+	 * Compress only when more than one block's worth of data is on hand,
+	 * so the final (possibly full) block always remains buffered in b[].
+	 */
+	if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) {
+		if (ctx->h.bCnt != 0) {
+			/* top up the partially filled b[] and compress it */
+			room = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt;
+			if (room != 0) {
+				Skein_assert(room < msgByteCnt);
+				bcopy(msg, &ctx->b[ctx->h.bCnt], room);
+				ctx->h.bCnt += room;
+				msg += room;
+				msgByteCnt -= room;
+			}
+			Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES);
+			Skein1024_Process_Block(ctx, ctx->b, 1,
+			    SKEIN1024_BLOCK_BYTES);
+			ctx->h.bCnt = 0;
+		}
+		if (msgByteCnt > SKEIN1024_BLOCK_BYTES) {
+			/*
+			 * compress whole blocks straight from the caller's
+			 * buffer, always holding back at least one byte
+			 */
+			nBlks = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES;
+			Skein1024_Process_Block(ctx, msg, nBlks,
+			    SKEIN1024_BLOCK_BYTES);
+			msg += nBlks * SKEIN1024_BLOCK_BYTES;
+			msgByteCnt -= nBlks * SKEIN1024_BLOCK_BYTES;
+		}
+		Skein_assert(ctx->h.bCnt == 0);
+	}
+
+	/* stash whatever is left in b[] for a later Update or Final call */
+	if (msgByteCnt != 0) {
+		Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES);
+		bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt);
+		ctx->h.bCnt += msgByteCnt;
+	}
+
+	return (SKEIN_SUCCESS);
+}
+
+/* Pad and absorb the final block, then emit the digest into hashVal. */
+int
+Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t ctr, off, chunk, outBytes;
+	uint64_t savedX[SKEIN1024_STATE_WORDS];
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* zero-fill the unused tail of b[] and process it as FINAL */
+	if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES)
+		bzero(&ctx->b[ctx->h.bCnt],
+		    SKEIN1024_BLOCK_BYTES - ctx->h.bCnt);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;
+	Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt);
+
+	/* digest length in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * Output stage: for each block of output, process an 8-byte
+	 * counter held in b[] (remainder zero) as an OUT_FINAL-type block.
+	 * Every iteration starts again from the same chaining values, which
+	 * are kept in savedX[].
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN1024_BLOCK_BYTES) {
+		uint64_t ctrWord = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&ctrWord, ctx->b, sizeof (ctrWord));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+
+		/* hand out at most one block's worth of bytes per pass */
+		chunk = outBytes - off;
+		if (chunk > SKEIN1024_BLOCK_BYTES)
+			chunk = SKEIN1024_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, chunk);
+		Skein_Show_Final(1024, &ctx->h, chunk, hashVal + off);
+
+		/* rewind the chaining values for the next counter value */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* Functions to support MAC/tree hashing */
+/* (this code is identical for Optimized and Reference versions) */
+
+/* Process the last (zero-padded) block and copy out the raw state. */
+int
+Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t used;
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* clear whatever part of b[] the buffered message does not cover */
+	used = ctx->h.bCnt;
+	if (used < SKEIN_256_BLOCK_BYTES)
+		bzero(&ctx->b[used], SKEIN_256_BLOCK_BYTES - used);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;	/* this is the last block */
+	Skein_256_Process_Block(ctx, ctx->b, 1, used);
+
+	/* no OUTPUT stage here: the chaining values are the result */
+	Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_256_BLOCK_BYTES);
+	return (SKEIN_SUCCESS);
+}
+
+/* Process the last (zero-padded) block and copy out the raw state. */
+int
+Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t used;
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* clear whatever part of b[] the buffered message does not cover */
+	used = ctx->h.bCnt;
+	if (used < SKEIN_512_BLOCK_BYTES)
+		bzero(&ctx->b[used], SKEIN_512_BLOCK_BYTES - used);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;	/* this is the last block */
+	Skein_512_Process_Block(ctx, ctx->b, 1, used);
+
+	/* no OUTPUT stage here: the chaining values are the result */
+	Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_512_BLOCK_BYTES);
+	return (SKEIN_SUCCESS);
+}
+
+/* Process the last (zero-padded) block and copy out the raw state. */
+int
+Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t used;
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* clear whatever part of b[] the buffered message does not cover */
+	used = ctx->h.bCnt;
+	if (used < SKEIN1024_BLOCK_BYTES)
+		bzero(&ctx->b[used], SKEIN1024_BLOCK_BYTES - used);
+	ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL;	/* this is the last block */
+	Skein1024_Process_Block(ctx, ctx->b, 1, used);
+
+	/* no OUTPUT stage here: the chaining values are the result */
+	Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN1024_BLOCK_BYTES);
+
+	return (SKEIN_SUCCESS);
+}
+
+#if	SKEIN_TREE_HASH
+/* Run only the OUTPUT stage on the current state, writing to hashVal. */
+int
+Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t ctr, off, chunk, outBytes;
+	uint64_t savedX[SKEIN_256_STATE_WORDS];
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* digest length in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * For each block of output, process an 8-byte counter held in b[]
+	 * (remainder zero) as an OUT_FINAL-type block.  Every iteration
+	 * starts again from the same chaining values, which are kept in
+	 * savedX[].
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN_256_BLOCK_BYTES) {
+		uint64_t ctrWord = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&ctrWord, ctx->b, sizeof (ctrWord));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* hand out at most one block's worth of bytes per pass */
+		chunk = outBytes - off;
+		if (chunk > SKEIN_256_BLOCK_BYTES)
+			chunk = SKEIN_256_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, chunk);
+		Skein_Show_Final(256, &ctx->h, chunk, hashVal + off);
+		/* rewind the chaining values for the next counter value */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* Run only the OUTPUT stage on the current state, writing to hashVal. */
+int
+Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t ctr, off, chunk, outBytes;
+	uint64_t savedX[SKEIN_512_STATE_WORDS];
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* digest length in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * For each block of output, process an 8-byte counter held in b[]
+	 * (remainder zero) as an OUT_FINAL-type block, always starting from
+	 * the chaining values saved in savedX[].
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN_512_BLOCK_BYTES) {
+		uint64_t ctrWord = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&ctrWord, ctx->b, sizeof (ctrWord));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* hand out at most one block's worth of bytes per pass */
+		chunk = outBytes - off;
+		if (chunk > SKEIN_512_BLOCK_BYTES)
+			chunk = SKEIN_512_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, chunk);
+		/* debug hook, tagged 512 to match Skein_512_Final() */
+		Skein_Show_Final(512, &ctx->h, chunk, hashVal + off);
+		/* rewind the chaining values for the next counter value */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* Run only the OUTPUT stage on the current state, writing to hashVal. */
+int
+Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t ctr, off, chunk, outBytes;
+	uint64_t savedX[SKEIN1024_STATE_WORDS];
+
+	/* a byte count beyond the block size means ctx was never set up */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* digest length in bytes, rounding a partial byte up */
+	outBytes = (ctx->h.hashBitLen + 7) >> 3;
+
+	/*
+	 * For each block of output, process an 8-byte counter held in b[]
+	 * (remainder zero) as an OUT_FINAL-type block, always starting from
+	 * the chaining values saved in savedX[].
+	 */
+	bzero(ctx->b, sizeof (ctx->b));
+	bcopy(ctx->X, savedX, sizeof (savedX));
+	for (ctr = 0, off = 0; off < outBytes;
+	    ctr++, off += SKEIN1024_BLOCK_BYTES) {
+		uint64_t ctrWord = Skein_Swap64((uint64_t)ctr);
+
+		bcopy(&ctrWord, ctx->b, sizeof (ctrWord));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* hand out at most one block's worth of bytes per pass */
+		chunk = outBytes - off;
+		if (chunk > SKEIN1024_BLOCK_BYTES)
+			chunk = SKEIN1024_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + off, ctx->X, chunk);
+		/* debug hook, tagged 1024 to match Skein1024_Final() */
+		Skein_Show_Final(1024, &ctx->h, chunk, hashVal + off);
+		/* rewind the chaining values for the next counter value */
+		bcopy(savedX, ctx->X, sizeof (savedX));
+	}
+	return (SKEIN_SUCCESS);
+}
+#endif
+
+#ifdef _KERNEL	/* kernel build: export the Skein-512 entry points only */
+EXPORT_SYMBOL(Skein_512_Init);
+EXPORT_SYMBOL(Skein_512_InitExt);
+EXPORT_SYMBOL(Skein_512_Update);
+EXPORT_SYMBOL(Skein_512_Final);
+#endif	/* _KERNEL */
diff --git a/module/icp/algs/skein/skein_block.c b/module/icp/algs/skein/skein_block.c
new file mode 100644
index 000000000..d2e811963
--- /dev/null
+++ b/module/icp/algs/skein/skein_block.c
@@ -0,0 +1,793 @@
+/*
+ * Implementation of the Skein block functions.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ * Compile-time switches:
+ *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
+ *                    versions use ASM code for block processing
+ *                    [default: use C for all block sizes]
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#include <sys/skein.h>
+#include "skein_impl.h"
+#include <sys/isa_defs.h>	/* for _ILP32 */
+
+#ifndef	SKEIN_USE_ASM
+#define	SKEIN_USE_ASM	(0)	/* default is all C code (no ASM) */
+#endif
+
+#ifndef	SKEIN_LOOP
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stack frames are enforced (like 32-bit kernel builds), do not unroll
+ * checksum calculations to save stack space.
+ *
+ * Even with no loops unrolled, we still can exceed the 1k stack frame limit
+ * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32).  We can
+ * safely ignore it though, since the checksum functions will be called
+ * from a worker thread that won't be using much stack.  That's why we have
+ * the #pragma here to ignore the warning.
+ */
+#if defined(_ILP32) || defined(__powerpc)	/* Assume small stack */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+/*
+ * Small-stack build (ILP32 or PowerPC): don't unroll loops, to save stack
+ * frame space.
+ * Due to the way the calculations on SKEIN_LOOP are done in
+ * Skein_*_Process_Block(), a value of 111 disables unrolling loops
+ * in any of those functions.
+ */
+#define	SKEIN_LOOP 111
+#else
+/* We're compiling with large stacks */
+#define	SKEIN_LOOP 001		/* default: unroll 256 and 512, but not 1024 */
+#endif	/* _ILP32 || __powerpc */
+#endif	/* !SKEIN_LOOP */
+
+/* kw[] layout: the 3 tweak words come first, then the key schedule words */
+#define	BLK_BITS	(WCNT*64)	/* WCNT is set by each block function */
+#define	KW_TWK_BASE	(0)		/* index of the first tweak word */
+#define	KW_KEY_BASE	(3)		/* index of the first key word */
+#define	ks		(kw + KW_KEY_BASE)	/* key schedule view of kw[] */
+#define	ts		(kw + KW_TWK_BASE)	/* tweak view of kw[] */
+
+/* no debugging in Illumos version */
+#define	DebugSaveTweak(ctx)	/* expands to nothing */
+
+/* Skein_256 */
+#if	!(SKEIN_USE_ASM & 256)
+
+void
+Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd)
+{	/* C version: run blkCnt blocks from blkPtr through ctx->X */
+	enum {
+		WCNT = SKEIN_256_STATE_WORDS
+	};
+#undef  RCNT
+#define	RCNT  (SKEIN_256_ROUNDS_TOTAL / 8)
+
+#ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
+#define	SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
+#else
+#define	SKEIN_UNROLL_256 (0)
+#endif
+
+#if	SKEIN_UNROLL_256
+#if	(RCNT % SKEIN_UNROLL_256)
+#error "Invalid SKEIN_UNROLL_256"	/* sanity check on unroll count */
+#endif
+	size_t r;
+	/* key schedule words : chaining vars + tweak + "rotation" */
+	uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
+#endif
+	/* working copy of the four state words, kept in locals for speed */
+	uint64_t X0, X1, X2, X3;
+	uint64_t w[WCNT];		/* local copy of input block */
+#ifdef	SKEIN_DEBUG
+	/* use for debugging (help compiler put Xn in registers) */
+	const uint64_t *Xptr[4];
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
+#endif
+	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
+	ts[0] = ctx->h.T[0];	/* work on a local copy of the tweak */
+	ts[1] = ctx->h.T[1];
+	do {
+		/*
+		 * this implementation only supports 2**64 input bytes
+		 * (no carry out here)
+		 */
+		ts[0] += byteCntAdd;	/* update processed length */
+
+		/* key schedule: chaining vars plus one parity word, ks[4] */
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
+
+		ts[2] = ts[0] ^ ts[1];	/* third tweak word: XOR of the two */
+
+		/* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT);
+		DebugSaveTweak(ctx);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X0 = w[0] + ks[0];	/* do the first full key injection */
+		X1 = w[1] + ks[1] + ts[0];
+		X2 = w[2] + ks[2] + ts[1];
+		X3 = w[3] + ks[3];
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+		    Xptr);	/* show starting state values */
+
+		blkPtr += SKEIN_256_BLOCK_BYTES;	/* step to next block */
+
+		/* run the rounds */
+
+#define	Round256(p0, p1, p2, p3, ROT, rNum)                          \
+    X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
+    X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
+
+#if	SKEIN_UNROLL_256 == 0
+#define	R256(p0, p1, p2, p3, ROT, rNum)		/* fully unrolled */	\
+    Round256(p0, p1, p2, p3, ROT, rNum)					\
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define	I256(R)								\
+    X0 += ks[((R) + 1) % 5];	/* inject the key schedule value */	\
+    X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3];			\
+    X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3];			\
+    X3 += ks[((R) + 4) % 5] + (R) + 1;					\
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else				/* looping version */
+#define	R256(p0, p1, p2, p3, ROT, rNum)                             \
+    Round256(p0, p1, p2, p3, ROT, rNum)                             \
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define	I256(R)								\
+	X0 += ks[r + (R) + 0];	/* inject the key schedule value */	\
+	X1 += ks[r + (R) + 1] + ts[r + (R) + 0];			\
+	X2 += ks[r + (R) + 2] + ts[r + (R) + 1];			\
+	X3 += ks[r + (R) + 3] + r + (R);				\
+	ks[r + (R) + 4] = ks[r + (R) - 1];   /* rotate key schedule */	\
+    ts[r + (R) + 2] = ts[r + (R) - 1];					\
+    Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		/* each pass of this loop covers 8 * SKEIN_UNROLL_256 rounds */
+		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
+#endif
+		{
+#define	R256_8_rounds(R)                         \
+	R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1);  \
+	R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2);  \
+	R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3);  \
+	R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4);  \
+	I256(2 * (R));                           \
+	R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5);  \
+	R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6);  \
+	R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7);  \
+	R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8);  \
+	I256(2 * (R) + 1);
+
+			R256_8_rounds(0);
+
+#define	R256_Unroll_R(NN) \
+	((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
+	(SKEIN_UNROLL_256 > (NN)))
+
+#if	R256_Unroll_R(1)
+			R256_8_rounds(1);
+#endif
+#if	R256_Unroll_R(2)
+			R256_8_rounds(2);
+#endif
+#if	R256_Unroll_R(3)
+			R256_8_rounds(3);
+#endif
+#if	R256_Unroll_R(4)
+			R256_8_rounds(4);
+#endif
+#if	R256_Unroll_R(5)
+			R256_8_rounds(5);
+#endif
+#if	R256_Unroll_R(6)
+			R256_8_rounds(6);
+#endif
+#if	R256_Unroll_R(7)
+			R256_8_rounds(7);
+#endif
+#if	R256_Unroll_R(8)
+			R256_8_rounds(8);
+#endif
+#if	R256_Unroll_R(9)
+			R256_8_rounds(9);
+#endif
+#if	R256_Unroll_R(10)
+			R256_8_rounds(10);
+#endif
+#if	R256_Unroll_R(11)
+			R256_8_rounds(11);
+#endif
+#if	R256_Unroll_R(12)
+			R256_8_rounds(12);
+#endif
+#if	R256_Unroll_R(13)
+			R256_8_rounds(13);
+#endif
+#if	R256_Unroll_R(14)
+			R256_8_rounds(14);
+#endif
+#if	(SKEIN_UNROLL_256 > 14)
+#error  "need more unrolling in Skein_256_Process_Block"
+#endif
+		}
+		/*
+		 * do the final "feedforward" xor, update context chaining vars
+		 */
+		ctx->X[0] = X0 ^ w[0];
+		ctx->X[1] = X1 ^ w[1];
+		ctx->X[2] = X2 ^ w[2];
+		ctx->X[3] = X3 ^ w[3];
+
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+		ts[1] &= ~SKEIN_T1_FLAG_FIRST;	/* clear FIRST for later blocks */
+	}
+	while (--blkCnt);	/* one pass per input block */
+	ctx->h.T[0] = ts[0];	/* store the updated tweak back in ctx */
+	ctx->h.T[1] = ts[1];
+}
+
+#if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein_256_Process_Block_CodeSize(void)
+{	/* distance in bytes from the block function to this one */
+	uint8_t *blkFnEnd = (uint8_t *)Skein_256_Process_Block_CodeSize;
+	return (blkFnEnd - (uint8_t *)Skein_256_Process_Block);
+}
+
+uint_t
+Skein_256_Unroll_Cnt(void)
+{	/* report the compile-time unroll setting used for Skein-256 */
+	return ((uint_t)SKEIN_UNROLL_256);
+}
+#endif
+#endif
+
+/* Skein_512 */
+#if	!(SKEIN_USE_ASM & 512)
+void
+Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd)
+{				/* do it in C */
+	enum {
+		WCNT = SKEIN_512_STATE_WORDS
+	};
+#undef  RCNT
+#define	RCNT  (SKEIN_512_ROUNDS_TOTAL / 8)
+
+#ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
+#define	SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
+#else
+#define	SKEIN_UNROLL_512 (0)
+#endif
+
+#if	SKEIN_UNROLL_512
+#if	(RCNT % SKEIN_UNROLL_512)
+#error "Invalid SKEIN_UNROLL_512"	/* sanity check on unroll count */
+#endif
+	size_t r;
+	/* key schedule words : chaining vars + tweak + "rotation" */
+	uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
+#endif
+	/* local copy of vars, for speed */
+	uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
+	uint64_t w[WCNT];		/* local copy of input block */
+#ifdef	SKEIN_DEBUG
+	/* use for debugging (help compiler put Xn in registers) */
+	const uint64_t *Xptr[8];
+	Xptr[0] = &X0;
+	Xptr[1] = &X1;
+	Xptr[2] = &X2;
+	Xptr[3] = &X3;
+	Xptr[4] = &X4;
+	Xptr[5] = &X5;
+	Xptr[6] = &X6;
+	Xptr[7] = &X7;
+#endif
+
+	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
+	ts[0] = ctx->h.T[0];
+	ts[1] = ctx->h.T[1];
+	do {
+		/*
+		 * this implementation only supports 2**64 input bytes
+		 * (no carry out here)
+		 */
+		ts[0] += byteCntAdd;	/* update processed length */
+
+		/* precompute the key schedule for this block */
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ctx->X[4];
+		ks[5] = ctx->X[5];
+		ks[6] = ctx->X[6];
+		ks[7] = ctx->X[7];
+		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
+
+		ts[2] = ts[0] ^ ts[1];
+
+		/* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT);
+		DebugSaveTweak(ctx);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X0 = w[0] + ks[0];	/* do the first full key injection */
+		X1 = w[1] + ks[1];
+		X2 = w[2] + ks[2];
+		X3 = w[3] + ks[3];
+		X4 = w[4] + ks[4];
+		X5 = w[5] + ks[5] + ts[0];
+		X6 = w[6] + ks[6] + ts[1];
+		X7 = w[7] + ks[7];
+
+		blkPtr += SKEIN_512_BLOCK_BYTES;
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+		    Xptr);
+		/* run the rounds */
+#define	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
+	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
+
+#if	SKEIN_UNROLL_512 == 0
+#define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)	/* unrolled */	\
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
+
+#define	I512(R)								\
+	X0 += ks[((R) + 1) % 9];	/* inject the key schedule value */\
+	X1 += ks[((R) + 2) % 9];					\
+	X2 += ks[((R) + 3) % 9];					\
+	X3 += ks[((R) + 4) % 9];					\
+	X4 += ks[((R) + 5) % 9];					\
+	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3];			\
+	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3];			\
+	X7 += ks[((R) + 8) % 9] + (R) + 1;				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else				/* looping version */
+#define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)			\
+	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)		\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
+
+#define	I512(R)								\
+	X0 += ks[r + (R) + 0];	/* inject the key schedule value */	\
+	X1 += ks[r + (R) + 1];						\
+	X2 += ks[r + (R) + 2];						\
+	X3 += ks[r + (R) + 3];						\
+	X4 += ks[r + (R) + 4];						\
+	X5 += ks[r + (R) + 5] + ts[r + (R) + 0];			\
+	X6 += ks[r + (R) + 6] + ts[r + (R) + 1];			\
+	X7 += ks[r + (R) + 7] + r + (R);				\
+	ks[r + (R)+8] = ks[r + (R) - 1];	/* rotate key schedule */\
+	ts[r + (R)+2] = ts[r + (R) - 1];				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		/* loop thru it */
+		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
+#endif				/* end of looped code definitions */
+		{
+#define	R512_8_rounds(R)	/* do 8 full rounds */			\
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1);		\
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2);		\
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3);		\
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4);		\
+	I512(2 * (R));							\
+	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5);		\
+	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6);		\
+	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7);		\
+	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8);		\
+	I512(2*(R) + 1);		/* and key injection */
+
+			R512_8_rounds(0);
+
+#define	R512_Unroll_R(NN) \
+	((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
+	(SKEIN_UNROLL_512 > (NN)))
+
+#if	R512_Unroll_R(1)
+			R512_8_rounds(1);
+#endif
+#if	R512_Unroll_R(2)
+			R512_8_rounds(2);
+#endif
+#if	R512_Unroll_R(3)
+			R512_8_rounds(3);
+#endif
+#if	R512_Unroll_R(4)
+			R512_8_rounds(4);
+#endif
+#if	R512_Unroll_R(5)
+			R512_8_rounds(5);
+#endif
+#if	R512_Unroll_R(6)
+			R512_8_rounds(6);
+#endif
+#if	R512_Unroll_R(7)
+			R512_8_rounds(7);
+#endif
+#if	R512_Unroll_R(8)
+			R512_8_rounds(8);
+#endif
+#if	R512_Unroll_R(9)
+			R512_8_rounds(9);
+#endif
+#if	R512_Unroll_R(10)
+			R512_8_rounds(10);
+#endif
+#if	R512_Unroll_R(11)
+			R512_8_rounds(11);
+#endif
+#if	R512_Unroll_R(12)
+			R512_8_rounds(12);
+#endif
+#if	R512_Unroll_R(13)
+			R512_8_rounds(13);
+#endif
+#if	R512_Unroll_R(14)
+			R512_8_rounds(14);
+#endif
+#if	(SKEIN_UNROLL_512 > 14)
+#error "need more unrolling in Skein_512_Process_Block"
+#endif
+		}
+
+		/*
+		 * do the final "feedforward" xor, update context chaining vars
+		 */
+		ctx->X[0] = X0 ^ w[0];
+		ctx->X[1] = X1 ^ w[1];
+		ctx->X[2] = X2 ^ w[2];
+		ctx->X[3] = X3 ^ w[3];
+		ctx->X[4] = X4 ^ w[4];
+		ctx->X[5] = X5 ^ w[5];
+		ctx->X[6] = X6 ^ w[6];
+		ctx->X[7] = X7 ^ w[7];
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+	}
+	while (--blkCnt);
+	ctx->h.T[0] = ts[0];
+	ctx->h.T[1] = ts[1];
+}
+
+#if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein_512_Process_Block_CodeSize(void)
+{
+	return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
+	    ((uint8_t *)Skein_512_Process_Block);	/* address gap, bytes */
+}
+
+uint_t
+Skein_512_Unroll_Cnt(void)
+{
+	return (SKEIN_UNROLL_512);	/* 0 selects the fully unrolled path */
+}
+#endif
+#endif
+
+/*  Skein1024 */
+#if	!(SKEIN_USE_ASM & 1024)
+void
+Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
+    size_t blkCnt, size_t byteCntAdd)
+{
+	/* do it in C, always looping (unrolled is bigger AND slower!) */
+	enum {
+		WCNT = SKEIN1024_STATE_WORDS
+	};
+#undef  RCNT
+#define	RCNT  (SKEIN1024_ROUNDS_TOTAL/8)
+
+#ifdef	SKEIN_LOOP		/* configure how much to unroll the loop */
+#define	SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
+#else
+#define	SKEIN_UNROLL_1024 (0)
+#endif
+
+#if	(SKEIN_UNROLL_1024 != 0)
+#if	(RCNT % SKEIN_UNROLL_1024)
+#error "Invalid SKEIN_UNROLL_1024"	/* sanity check on unroll count */
+#endif
+	size_t r;
+	/* key schedule words : chaining vars + tweak + "rotation" */
+	uint64_t kw[WCNT + 4 + RCNT * 2];
+#else
+	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
+#endif
+
+	/* local copy of vars, for speed */
+	uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
+	    X12, X13, X14, X15;
+	uint64_t w[WCNT];		/* local copy of input block */
+#ifdef	SKEIN_DEBUG
+	/* use for debugging (help compiler put Xn in registers) */
+	const uint64_t *Xptr[16];
+	Xptr[0] = &X00;
+	Xptr[1] = &X01;
+	Xptr[2] = &X02;
+	Xptr[3] = &X03;
+	Xptr[4] = &X04;
+	Xptr[5] = &X05;
+	Xptr[6] = &X06;
+	Xptr[7] = &X07;
+	Xptr[8] = &X08;
+	Xptr[9] = &X09;
+	Xptr[10] = &X10;
+	Xptr[11] = &X11;
+	Xptr[12] = &X12;
+	Xptr[13] = &X13;
+	Xptr[14] = &X14;
+	Xptr[15] = &X15;
+#endif
+
+	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
+	ts[0] = ctx->h.T[0];
+	ts[1] = ctx->h.T[1];
+	do {
+		/*
+		 * this implementation only supports 2**64 input bytes
+		 * (no carry out here)
+		 */
+		ts[0] += byteCntAdd;	/* update processed length */
+
+		/* precompute the key schedule for this block */
+		ks[0] = ctx->X[0];
+		ks[1] = ctx->X[1];
+		ks[2] = ctx->X[2];
+		ks[3] = ctx->X[3];
+		ks[4] = ctx->X[4];
+		ks[5] = ctx->X[5];
+		ks[6] = ctx->X[6];
+		ks[7] = ctx->X[7];
+		ks[8] = ctx->X[8];
+		ks[9] = ctx->X[9];
+		ks[10] = ctx->X[10];
+		ks[11] = ctx->X[11];
+		ks[12] = ctx->X[12];
+		ks[13] = ctx->X[13];
+		ks[14] = ctx->X[14];
+		ks[15] = ctx->X[15];
+		ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
+		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
+		    ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
+		    ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
+
+		ts[2] = ts[0] ^ ts[1];
+
+		/* get input block in little-endian format */
+		Skein_Get64_LSB_First(w, blkPtr, WCNT);
+		DebugSaveTweak(ctx);
+		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
+
+		X00 = w[0] + ks[0];	/* do the first full key injection */
+		X01 = w[1] + ks[1];
+		X02 = w[2] + ks[2];
+		X03 = w[3] + ks[3];
+		X04 = w[4] + ks[4];
+		X05 = w[5] + ks[5];
+		X06 = w[6] + ks[6];
+		X07 = w[7] + ks[7];
+		X08 = w[8] + ks[8];
+		X09 = w[9] + ks[9];
+		X10 = w[10] + ks[10];
+		X11 = w[11] + ks[11];
+		X12 = w[12] + ks[12];
+		X13 = w[13] + ks[13] + ts[0];
+		X14 = w[14] + ks[14] + ts[1];
+		X15 = w[15] + ks[15];
+
+		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
+		    Xptr);
+
+#define	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
+	pD, pE, pF, ROT, rNum)						\
+	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
+	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
+	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
+	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
+	X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
+	X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
+	X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
+	X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
+
+#if	SKEIN_UNROLL_1024 == 0
+#define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,	\
+	pE, pF, ROT, rn)						\
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
+	pD, pE, pF, ROT, rn)						\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
+
+#define	I1024(R)							\
+	X00 += ks[((R) + 1) % 17];	/* inject the key schedule value */\
+	X01 += ks[((R) + 2) % 17];					\
+	X02 += ks[((R) + 3) % 17];					\
+	X03 += ks[((R) + 4) % 17];					\
+	X04 += ks[((R) + 5) % 17];					\
+	X05 += ks[((R) + 6) % 17];					\
+	X06 += ks[((R) + 7) % 17];					\
+	X07 += ks[((R) + 8) % 17];					\
+	X08 += ks[((R) + 9) % 17];					\
+	X09 += ks[((R) + 10) % 17];					\
+	X10 += ks[((R) + 11) % 17];					\
+	X11 += ks[((R) + 12) % 17];					\
+	X12 += ks[((R) + 13) % 17];					\
+	X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3];			\
+	X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3];			\
+	X15 += ks[((R) + 16) % 17] + (R) +1;				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+#else				/* looping version */
+#define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD,	\
+	pE, pF, ROT, rn)						\
+	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC,	\
+	pD, pE, pF, ROT, rn)						\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
+
+#define	I1024(R)							\
+	X00 += ks[r + (R) + 0];	/* inject the key schedule value */	\
+	X01 += ks[r + (R) + 1];						\
+	X02 += ks[r + (R) + 2];						\
+	X03 += ks[r + (R) + 3];						\
+	X04 += ks[r + (R) + 4];						\
+	X05 += ks[r + (R) + 5];						\
+	X06 += ks[r + (R) + 6];						\
+	X07 += ks[r + (R) + 7];						\
+	X08 += ks[r + (R) + 8];						\
+	X09 += ks[r + (R) + 9];						\
+	X10 += ks[r + (R) + 10];					\
+	X11 += ks[r + (R) + 11];					\
+	X12 += ks[r + (R) + 12];					\
+	X13 += ks[r + (R) + 13] + ts[r + (R) + 0];			\
+	X14 += ks[r + (R) + 14] + ts[r + (R) + 1];			\
+	X15 += ks[r + (R) + 15] +  r + (R);				\
+	ks[r + (R) + 16] = ks[r + (R) - 1];	/* rotate key schedule */\
+	ts[r + (R) + 2] = ts[r + (R) - 1];				\
+	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
+
+		/* loop thru it */
+		for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
+#endif
+		{
+#define	R1024_8_rounds(R)	/* do 8 full rounds */			\
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,	\
+	    14, 15, R1024_0, 8 * (R) + 1);				\
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,	\
+	    08, 01, R1024_1, 8 * (R) + 2);				\
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,	\
+	    10, 09, R1024_2, 8 * (R) + 3);				\
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,	\
+	    12, 07, R1024_3, 8 * (R) + 4);				\
+	I1024(2 * (R));							\
+	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13,	\
+	    14, 15, R1024_4, 8 * (R) + 5);				\
+	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05,	\
+	    08, 01, R1024_5, 8 * (R) + 6);				\
+	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11,	\
+	    10, 09, R1024_6, 8 * (R) + 7);				\
+	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03,	\
+	    12, 07, R1024_7, 8 * (R) + 8);				\
+	I1024(2 * (R) + 1);
+
+			R1024_8_rounds(0);
+
+#define	R1024_Unroll_R(NN)						\
+	((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) ||	\
+	(SKEIN_UNROLL_1024 > (NN)))
+
+#if	R1024_Unroll_R(1)
+			R1024_8_rounds(1);
+#endif
+#if	R1024_Unroll_R(2)
+			R1024_8_rounds(2);
+#endif
+#if	R1024_Unroll_R(3)
+			R1024_8_rounds(3);
+#endif
+#if	R1024_Unroll_R(4)
+			R1024_8_rounds(4);
+#endif
+#if	R1024_Unroll_R(5)
+			R1024_8_rounds(5);
+#endif
+#if	R1024_Unroll_R(6)
+			R1024_8_rounds(6);
+#endif
+#if	R1024_Unroll_R(7)
+			R1024_8_rounds(7);
+#endif
+#if	R1024_Unroll_R(8)
+			R1024_8_rounds(8);
+#endif
+#if	R1024_Unroll_R(9)
+			R1024_8_rounds(9);
+#endif
+#if	R1024_Unroll_R(10)
+			R1024_8_rounds(10);
+#endif
+#if	R1024_Unroll_R(11)
+			R1024_8_rounds(11);
+#endif
+#if	R1024_Unroll_R(12)
+			R1024_8_rounds(12);
+#endif
+#if	R1024_Unroll_R(13)
+			R1024_8_rounds(13);
+#endif
+#if	R1024_Unroll_R(14)
+			R1024_8_rounds(14);
+#endif
+#if	(SKEIN_UNROLL_1024 > 14)
+#error  "need more unrolling in Skein_1024_Process_Block"
+#endif
+		}
+		/*
+		 * do the final "feedforward" xor, update context chaining vars
+		 */
+
+		ctx->X[0] = X00 ^ w[0];
+		ctx->X[1] = X01 ^ w[1];
+		ctx->X[2] = X02 ^ w[2];
+		ctx->X[3] = X03 ^ w[3];
+		ctx->X[4] = X04 ^ w[4];
+		ctx->X[5] = X05 ^ w[5];
+		ctx->X[6] = X06 ^ w[6];
+		ctx->X[7] = X07 ^ w[7];
+		ctx->X[8] = X08 ^ w[8];
+		ctx->X[9] = X09 ^ w[9];
+		ctx->X[10] = X10 ^ w[10];
+		ctx->X[11] = X11 ^ w[11];
+		ctx->X[12] = X12 ^ w[12];
+		ctx->X[13] = X13 ^ w[13];
+		ctx->X[14] = X14 ^ w[14];
+		ctx->X[15] = X15 ^ w[15];
+
+		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
+
+		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
+		blkPtr += SKEIN1024_BLOCK_BYTES;
+	} while (--blkCnt);
+	ctx->h.T[0] = ts[0];
+	ctx->h.T[1] = ts[1];
+}
+
+#if	defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
+size_t
+Skein1024_Process_Block_CodeSize(void)
+{
+	return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
+	    ((uint8_t *)Skein1024_Process_Block);	/* address gap, bytes */
+}
+
+uint_t
+Skein1024_Unroll_Cnt(void)
+{
+	return (SKEIN_UNROLL_1024);	/* 0 selects the fully unrolled path */
+}
+#endif
+#endif
diff --git a/module/icp/algs/skein/skein_impl.h b/module/icp/algs/skein/skein_impl.h
new file mode 100644
index 000000000..e83a06971
--- /dev/null
+++ b/module/icp/algs/skein/skein_impl.h
@@ -0,0 +1,289 @@
+/*
+ * Internal definitions for Skein hashing.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ *
+ * The following compile-time switches may be defined to control some
+ * tradeoffs between speed, code size, error checking, and security.
+ *
+ * The "default" note explains what happens when the switch is not defined.
+ *
+ *  SKEIN_DEBUG            -- make callouts from inside Skein code
+ *                            to examine/display intermediate values.
+ *                            [default: no callouts (no overhead)]
+ *
+ *  SKEIN_ERR_CHECK        -- how error checking is handled inside Skein
+ *                            code. If not defined, most error checking
+ *                            is disabled (for performance). Otherwise:
+ *                                with SKEIN_ASSERT: ASSERT() every check
+ *                                without it: caller errors return the
+ *                                    error code, internal ones ASSERT()
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef	_SKEIN_IMPL_H_
+#define	_SKEIN_IMPL_H_
+
+#include <sys/skein.h>
+#include "skein_impl.h"
+#include "skein_port.h"
+
+/* determine where we can get bcopy/bzero declarations */
+#ifdef	_KERNEL
+#include <sys/systm.h>
+#else
+#include <strings.h>
+#endif
+
+/*
+ * "Internal" Skein definitions
+ *    -- not needed for sequential hashing API, but will be
+ *           helpful for other uses of Skein (e.g., tree hash mode).
+ *    -- included here so that they can be shared between
+ *           reference and optimized code.
+ */
+
+/* tweak word T[1]: bit field starting positions */
+/* offset 64 because it's the second word  */
+#define	SKEIN_T1_BIT(BIT)	((BIT) - 64)
+
+/* bits 112..118: level in hash tree */
+#define	SKEIN_T1_POS_TREE_LVL	SKEIN_T1_BIT(112)
+/* bit  119: partial final input byte */
+#define	SKEIN_T1_POS_BIT_PAD	SKEIN_T1_BIT(119)
+/* bits 120..125: type field */
+#define	SKEIN_T1_POS_BLK_TYPE	SKEIN_T1_BIT(120)
+/* bit  126: first block flag */
+#define	SKEIN_T1_POS_FIRST	SKEIN_T1_BIT(126)
+/* bit  127: final block flag */
+#define	SKEIN_T1_POS_FINAL	SKEIN_T1_BIT(127)
+
+/* tweak word T[1]: flag bit definition(s) */
+#define	SKEIN_T1_FLAG_FIRST	(((uint64_t)1) << SKEIN_T1_POS_FIRST)
+#define	SKEIN_T1_FLAG_FINAL	(((uint64_t)1) << SKEIN_T1_POS_FINAL)
+#define	SKEIN_T1_FLAG_BIT_PAD	(((uint64_t)1) << SKEIN_T1_POS_BIT_PAD)
+
+/* tweak word T[1]: tree level bit field mask */
+#define	SKEIN_T1_TREE_LVL_MASK	(((uint64_t)0x7F) << SKEIN_T1_POS_TREE_LVL)
+#define	SKEIN_T1_TREE_LEVEL(n)	(((uint64_t)(n)) << SKEIN_T1_POS_TREE_LVL)
+
+/* tweak word T[1]: block type field */
+#define	SKEIN_BLK_TYPE_KEY	(0)	/* key, for MAC and KDF */
+#define	SKEIN_BLK_TYPE_CFG	(4)	/* configuration block */
+#define	SKEIN_BLK_TYPE_PERS	(8)	/* personalization string */
+#define	SKEIN_BLK_TYPE_PK	(12)	/* public key (for signature hashing) */
+#define	SKEIN_BLK_TYPE_KDF	(16)	/* key identifier for KDF */
+#define	SKEIN_BLK_TYPE_NONCE	(20)	/* nonce for PRNG */
+#define	SKEIN_BLK_TYPE_MSG	(48)	/* message processing */
+#define	SKEIN_BLK_TYPE_OUT	(63)	/* output stage */
+#define	SKEIN_BLK_TYPE_MASK	(63)	/* bit field mask */
+
+#define	SKEIN_T1_BLK_TYPE(T)	\
+	(((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE)
+/* key, for MAC and KDF */
+#define	SKEIN_T1_BLK_TYPE_KEY	SKEIN_T1_BLK_TYPE(KEY)
+/* configuration block */
+#define	SKEIN_T1_BLK_TYPE_CFG	SKEIN_T1_BLK_TYPE(CFG)
+/* personalization string */
+#define	SKEIN_T1_BLK_TYPE_PERS	SKEIN_T1_BLK_TYPE(PERS)
+/* public key (for digital signature hashing) */
+#define	SKEIN_T1_BLK_TYPE_PK	SKEIN_T1_BLK_TYPE(PK)
+/* key identifier for KDF */
+#define	SKEIN_T1_BLK_TYPE_KDF	SKEIN_T1_BLK_TYPE(KDF)
+/* nonce for PRNG */
+#define	SKEIN_T1_BLK_TYPE_NONCE	SKEIN_T1_BLK_TYPE(NONCE)
+/* message processing */
+#define	SKEIN_T1_BLK_TYPE_MSG	SKEIN_T1_BLK_TYPE(MSG)
+/* output stage */
+#define	SKEIN_T1_BLK_TYPE_OUT	SKEIN_T1_BLK_TYPE(OUT)
+/* field bit mask */
+#define	SKEIN_T1_BLK_TYPE_MASK	SKEIN_T1_BLK_TYPE(MASK)
+
+#define	SKEIN_T1_BLK_TYPE_CFG_FINAL	\
+	(SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL)
+#define	SKEIN_T1_BLK_TYPE_OUT_FINAL	\
+	(SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL)
+
+#define	SKEIN_VERSION		(1)
+
+#ifndef	SKEIN_ID_STRING_LE	/* allow compile-time personalization */
+#define	SKEIN_ID_STRING_LE	(0x33414853)	/* "SHA3" (little-endian) */
+#endif
+
+#define	SKEIN_MK_64(hi32, lo32)	((lo32) + (((uint64_t)(hi32)) << 32))
+#define	SKEIN_SCHEMA_VER	SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE)
+#define	SKEIN_KS_PARITY		SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22)
+
+#define	SKEIN_CFG_STR_LEN	(4*8)
+
+/* bit field definitions in config block treeInfo word */
+#define	SKEIN_CFG_TREE_LEAF_SIZE_POS	(0)
+#define	SKEIN_CFG_TREE_NODE_SIZE_POS	(8)
+#define	SKEIN_CFG_TREE_MAX_LEVEL_POS	(16)
+
+#define	SKEIN_CFG_TREE_LEAF_SIZE_MSK	\
+	(((uint64_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS)
+#define	SKEIN_CFG_TREE_NODE_SIZE_MSK	\
+	(((uint64_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS)
+#define	SKEIN_CFG_TREE_MAX_LEVEL_MSK	\
+	(((uint64_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS)
+
+#define	SKEIN_CFG_TREE_INFO(leaf, node, maxLvl)			\
+	((((uint64_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) |	\
+	(((uint64_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) |	\
+	(((uint64_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS))
+
+/* use as treeInfo in InitExt() call for sequential processing */
+#define	SKEIN_CFG_TREE_INFO_SEQUENTIAL	SKEIN_CFG_TREE_INFO(0, 0, 0)
+
+/*
+ * Skein macros for getting/setting tweak words, etc.
+ * These are useful for partial input bytes, hash tree init/update, etc.
+ */
+#define	Skein_Get_Tweak(ctxPtr, TWK_NUM)	((ctxPtr)->h.T[TWK_NUM])
+#define	Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal)		\
+	do {						\
+		(ctxPtr)->h.T[TWK_NUM] = (tVal);	\
+		_NOTE(CONSTCOND)			\
+	} while (0)
+
+#define	Skein_Get_T0(ctxPtr)		Skein_Get_Tweak(ctxPtr, 0)
+#define	Skein_Get_T1(ctxPtr)		Skein_Get_Tweak(ctxPtr, 1)
+#define	Skein_Set_T0(ctxPtr, T0)	Skein_Set_Tweak(ctxPtr, 0, T0)
+#define	Skein_Set_T1(ctxPtr, T1)	Skein_Set_Tweak(ctxPtr, 1, T1)
+
+/* set both tweak words at once */
+#define	Skein_Set_T0_T1(ctxPtr, T0, T1)		\
+	do {					\
+		Skein_Set_T0(ctxPtr, (T0));	\
+		Skein_Set_T1(ctxPtr, (T1));	\
+		_NOTE(CONSTCOND)		\
+	} while (0)
+
+#define	Skein_Set_Type(ctxPtr, BLK_TYPE)	\
+	Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE)
+
+/*
+ * set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0;
+ */
+#define	Skein_Start_New_Type(ctxPtr, BLK_TYPE)				\
+	do {								\
+		Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST |	\
+		    SKEIN_T1_BLK_TYPE_ ## BLK_TYPE);			\
+		(ctxPtr)->h.bCnt = 0;	\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+#define	Skein_Clear_First_Flag(hdr)					\
+	do {								\
+		(hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST;			\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+#define	Skein_Set_Bit_Pad_Flag(hdr)					\
+	do {								\
+		(hdr).T[1] |=  SKEIN_T1_FLAG_BIT_PAD;			\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+#define	Skein_Set_Tree_Level(hdr, height)				\
+	do {								\
+		(hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height);		\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+/*
+ * "Internal" Skein definitions for debugging and error checking
+ * Note: in Illumos we always disable debugging features.
+ */
+#define	Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr)
+#define	Skein_Show_Round(bits, ctx, r, X)
+#define	Skein_Show_R_Ptr(bits, ctx, r, X_ptr)
+#define	Skein_Show_Final(bits, ctx, cnt, outPtr)
+#define	Skein_Show_Key(bits, ctx, key, keyBytes)
+
+/* run-time checks (e.g., bad params, uninitialized context)? */
+#ifndef	SKEIN_ERR_CHECK
+/* default: ignore all Asserts, for performance */
+#define	Skein_Assert(x, retCode)
+#define	Skein_assert(x)
+#elif	defined(SKEIN_ASSERT)
+#include <sys/debug.h>
+#define	Skein_Assert(x, retCode)	ASSERT(x)
+#define	Skein_assert(x)			ASSERT(x)
+#else
+#include <sys/debug.h>
+/*  caller error */
+#define	Skein_Assert(x, retCode)		\
+	do {					\
+		if (!(x))			\
+			return (retCode);	\
+		_NOTE(CONSTCOND)		\
+	} while (0)
+/* internal error */
+#define	Skein_assert(x)	ASSERT(x)
+#endif
+
+/*
+ * Skein block function constants (shared across Ref and Opt code)
+ */
+enum {
+	/* Skein_256 round rotation constants */
+	R_256_0_0 = 14, R_256_0_1 = 16,
+	R_256_1_0 = 52, R_256_1_1 = 57,
+	R_256_2_0 = 23, R_256_2_1 = 40,
+	R_256_3_0 = 5, R_256_3_1 = 37,
+	R_256_4_0 = 25, R_256_4_1 = 33,
+	R_256_5_0 = 46, R_256_5_1 = 12,
+	R_256_6_0 = 58, R_256_6_1 = 22,
+	R_256_7_0 = 32, R_256_7_1 = 32,
+
+	/* Skein_512 round rotation constants */
+	R_512_0_0 = 46, R_512_0_1 = 36, R_512_0_2 = 19, R_512_0_3 = 37,
+	R_512_1_0 = 33, R_512_1_1 = 27, R_512_1_2 = 14, R_512_1_3 = 42,
+	R_512_2_0 = 17, R_512_2_1 = 49, R_512_2_2 = 36, R_512_2_3 = 39,
+	R_512_3_0 = 44, R_512_3_1 = 9, R_512_3_2 = 54, R_512_3_3 = 56,
+	R_512_4_0 = 39, R_512_4_1 = 30, R_512_4_2 = 34, R_512_4_3 = 24,
+	R_512_5_0 = 13, R_512_5_1 = 50, R_512_5_2 = 10, R_512_5_3 = 17,
+	R_512_6_0 = 25, R_512_6_1 = 29, R_512_6_2 = 39, R_512_6_3 = 43,
+	R_512_7_0 = 8, R_512_7_1 = 35, R_512_7_2 = 56, R_512_7_3 = 22,
+
+	/* Skein1024 round rotation constants */
+	R1024_0_0 = 24, R1024_0_1 = 13, R1024_0_2 = 8, R1024_0_3 =
+	    47, R1024_0_4 = 8, R1024_0_5 = 17, R1024_0_6 = 22, R1024_0_7 = 37,
+	R1024_1_0 = 38, R1024_1_1 = 19, R1024_1_2 = 10, R1024_1_3 =
+	    55, R1024_1_4 = 49, R1024_1_5 = 18, R1024_1_6 = 23, R1024_1_7 = 52,
+	R1024_2_0 = 33, R1024_2_1 = 4, R1024_2_2 = 51, R1024_2_3 =
+	    13, R1024_2_4 = 34, R1024_2_5 = 41, R1024_2_6 = 59, R1024_2_7 = 17,
+	R1024_3_0 = 5, R1024_3_1 = 20, R1024_3_2 = 48, R1024_3_3 =
+	    41, R1024_3_4 = 47, R1024_3_5 = 28, R1024_3_6 = 16, R1024_3_7 = 25,
+	R1024_4_0 = 41, R1024_4_1 = 9, R1024_4_2 = 37, R1024_4_3 =
+	    31, R1024_4_4 = 12, R1024_4_5 = 47, R1024_4_6 = 44, R1024_4_7 = 30,
+	R1024_5_0 = 16, R1024_5_1 = 34, R1024_5_2 = 56, R1024_5_3 =
+	    51, R1024_5_4 = 4, R1024_5_5 = 53, R1024_5_6 = 42, R1024_5_7 = 41,
+	R1024_6_0 = 31, R1024_6_1 = 44, R1024_6_2 = 47, R1024_6_3 =
+	    46, R1024_6_4 = 19, R1024_6_5 = 42, R1024_6_6 = 44, R1024_6_7 = 25,
+	R1024_7_0 = 9, R1024_7_1 = 48, R1024_7_2 = 35, R1024_7_3 =
+	    52, R1024_7_4 = 23, R1024_7_5 = 31, R1024_7_6 = 37, R1024_7_7 = 20
+};
+
+/* number of rounds for the different block sizes */
+#define	SKEIN_256_ROUNDS_TOTAL	(72)
+#define	SKEIN_512_ROUNDS_TOTAL	(72)
+#define	SKEIN1024_ROUNDS_TOTAL	(80)
+
+
+extern const uint64_t SKEIN_256_IV_128[];
+extern const uint64_t SKEIN_256_IV_160[];
+extern const uint64_t SKEIN_256_IV_224[];
+extern const uint64_t SKEIN_256_IV_256[];
+extern const uint64_t SKEIN_512_IV_128[];
+extern const uint64_t SKEIN_512_IV_160[];
+extern const uint64_t SKEIN_512_IV_224[];
+extern const uint64_t SKEIN_512_IV_256[];
+extern const uint64_t SKEIN_512_IV_384[];
+extern const uint64_t SKEIN_512_IV_512[];
+extern const uint64_t SKEIN1024_IV_384[];
+extern const uint64_t SKEIN1024_IV_512[];
+extern const uint64_t SKEIN1024_IV_1024[];
+
+#endif	/* _SKEIN_IMPL_H_ */
diff --git a/module/icp/algs/skein/skein_iv.c b/module/icp/algs/skein/skein_iv.c
new file mode 100644
index 000000000..140d38f76
--- /dev/null
+++ b/module/icp/algs/skein/skein_iv.c
@@ -0,0 +1,185 @@
+/*
+ * Pre-computed Skein IVs
+ *
+ * NOTE: these values are not "magic" constants, but
+ * are generated using the Threefish block function.
+ * They are pre-computed here only for speed; i.e., to
+ * avoid the need for a Threefish call during Init().
+ *
+ * The IV for any fixed hash length may be pre-computed.
+ * Only the most common values are included here.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+/*
+ * Illumos implementation note: these constants are for Skein v1.3 as per:
+ * http://www.skein-hash.info/sites/default/files/skein1.3.pdf
+ */
+
+#include <sys/skein.h>		/* get Skein macros and types */
+#include "skein_impl.h"		/* get internal definitions */
+
+#define	MK_64 SKEIN_MK_64
+
+/* blkSize =  256 bits. hashSize =  128 bits */
+const uint64_t SKEIN_256_IV_128[] = {
+	MK_64(0xE1111906, 0x964D7260),
+	MK_64(0x883DAAA7, 0x7C8D811C),
+	MK_64(0x10080DF4, 0x91960F7A),
+	MK_64(0xCCF7DDE5, 0xB45BC1C2)
+};
+
+/* blkSize =  256 bits. hashSize =  160 bits */
+const uint64_t SKEIN_256_IV_160[] = {
+	MK_64(0x14202314, 0x72825E98),
+	MK_64(0x2AC4E9A2, 0x5A77E590),
+	MK_64(0xD47A5856, 0x8838D63E),
+	MK_64(0x2DD2E496, 0x8586AB7D)
+};
+
+/* blkSize =  256 bits. hashSize =  224 bits */
+const uint64_t SKEIN_256_IV_224[] = {
+	MK_64(0xC6098A8C, 0x9AE5EA0B),
+	MK_64(0x876D5686, 0x08C5191C),
+	MK_64(0x99CB88D7, 0xD7F53884),
+	MK_64(0x384BDDB1, 0xAEDDB5DE)
+};
+
+/* blkSize =  256 bits. hashSize =  256 bits */
+const uint64_t SKEIN_256_IV_256[] = {
+	MK_64(0xFC9DA860, 0xD048B449),
+	MK_64(0x2FCA6647, 0x9FA7D833),
+	MK_64(0xB33BC389, 0x6656840F),
+	MK_64(0x6A54E920, 0xFDE8DA69)
+};
+
+/* blkSize =  512 bits. hashSize =  128 bits */
+const uint64_t SKEIN_512_IV_128[] = {
+	MK_64(0xA8BC7BF3, 0x6FBF9F52),
+	MK_64(0x1E9872CE, 0xBD1AF0AA),
+	MK_64(0x309B1790, 0xB32190D3),
+	MK_64(0xBCFBB854, 0x3F94805C),
+	MK_64(0x0DA61BCD, 0x6E31B11B),
+	MK_64(0x1A18EBEA, 0xD46A32E3),
+	MK_64(0xA2CC5B18, 0xCE84AA82),
+	MK_64(0x6982AB28, 0x9D46982D)
+};
+
+/* blkSize =  512 bits. hashSize =  160 bits */
+const uint64_t SKEIN_512_IV_160[] = {
+	MK_64(0x28B81A2A, 0xE013BD91),
+	MK_64(0xC2F11668, 0xB5BDF78F),
+	MK_64(0x1760D8F3, 0xF6A56F12),
+	MK_64(0x4FB74758, 0x8239904F),
+	MK_64(0x21EDE07F, 0x7EAF5056),
+	MK_64(0xD908922E, 0x63ED70B8),
+	MK_64(0xB8EC76FF, 0xECCB52FA),
+	MK_64(0x01A47BB8, 0xA3F27A6E)
+};
+
+/* blkSize =  512 bits. hashSize =  224 bits */
+const uint64_t SKEIN_512_IV_224[] = {
+	MK_64(0xCCD06162, 0x48677224),
+	MK_64(0xCBA65CF3, 0xA92339EF),
+	MK_64(0x8CCD69D6, 0x52FF4B64),
+	MK_64(0x398AED7B, 0x3AB890B4),
+	MK_64(0x0F59D1B1, 0x457D2BD0),
+	MK_64(0x6776FE65, 0x75D4EB3D),
+	MK_64(0x99FBC70E, 0x997413E9),
+	MK_64(0x9E2CFCCF, 0xE1C41EF7)
+};
+
+/* blkSize =  512 bits. hashSize =  256 bits */
+const uint64_t SKEIN_512_IV_256[] = {
+	MK_64(0xCCD044A1, 0x2FDB3E13),
+	MK_64(0xE8359030, 0x1A79A9EB),
+	MK_64(0x55AEA061, 0x4F816E6F),
+	MK_64(0x2A2767A4, 0xAE9B94DB),
+	MK_64(0xEC06025E, 0x74DD7683),
+	MK_64(0xE7A436CD, 0xC4746251),
+	MK_64(0xC36FBAF9, 0x393AD185),
+	MK_64(0x3EEDBA18, 0x33EDFC13)
+};
+
+/* blkSize =  512 bits. hashSize =  384 bits */
+const uint64_t SKEIN_512_IV_384[] = {
+	MK_64(0xA3F6C6BF, 0x3A75EF5F),
+	MK_64(0xB0FEF9CC, 0xFD84FAA4),
+	MK_64(0x9D77DD66, 0x3D770CFE),
+	MK_64(0xD798CBF3, 0xB468FDDA),
+	MK_64(0x1BC4A666, 0x8A0E4465),
+	MK_64(0x7ED7D434, 0xE5807407),
+	MK_64(0x548FC1AC, 0xD4EC44D6),
+	MK_64(0x266E1754, 0x6AA18FF8)
+};
+
+/* blkSize =  512 bits. hashSize =  512 bits */
+const uint64_t SKEIN_512_IV_512[] = {
+	MK_64(0x4903ADFF, 0x749C51CE),
+	MK_64(0x0D95DE39, 0x9746DF03),
+	MK_64(0x8FD19341, 0x27C79BCE),
+	MK_64(0x9A255629, 0xFF352CB1),
+	MK_64(0x5DB62599, 0xDF6CA7B0),
+	MK_64(0xEABE394C, 0xA9D5C3F4),
+	MK_64(0x991112C7, 0x1A75B523),
+	MK_64(0xAE18A40B, 0x660FCC33)
+};
+
+/* blkSize = 1024 bits. hashSize =  384 bits */
+const uint64_t SKEIN1024_IV_384[] = {
+	MK_64(0x5102B6B8, 0xC1894A35),
+	MK_64(0xFEEBC9E3, 0xFE8AF11A),
+	MK_64(0x0C807F06, 0xE32BED71),
+	MK_64(0x60C13A52, 0xB41A91F6),
+	MK_64(0x9716D35D, 0xD4917C38),
+	MK_64(0xE780DF12, 0x6FD31D3A),
+	MK_64(0x797846B6, 0xC898303A),
+	MK_64(0xB172C2A8, 0xB3572A3B),
+	MK_64(0xC9BC8203, 0xA6104A6C),
+	MK_64(0x65909338, 0xD75624F4),
+	MK_64(0x94BCC568, 0x4B3F81A0),
+	MK_64(0x3EBBF51E, 0x10ECFD46),
+	MK_64(0x2DF50F0B, 0xEEB08542),
+	MK_64(0x3B5A6530, 0x0DBC6516),
+	MK_64(0x484B9CD2, 0x167BBCE1),
+	MK_64(0x2D136947, 0xD4CBAFEA)
+};
+
+/* blkSize = 1024 bits. hashSize =  512 bits */
+const uint64_t SKEIN1024_IV_512[] = {
+	MK_64(0xCAEC0E5D, 0x7C1B1B18),
+	MK_64(0xA01B0E04, 0x5F03E802),
+	MK_64(0x33840451, 0xED912885),
+	MK_64(0x374AFB04, 0xEAEC2E1C),
+	MK_64(0xDF25A0E2, 0x813581F7),
+	MK_64(0xE4004093, 0x8B12F9D2),
+	MK_64(0xA662D539, 0xC2ED39B6),
+	MK_64(0xFA8B85CF, 0x45D8C75A),
+	MK_64(0x8316ED8E, 0x29EDE796),
+	MK_64(0x053289C0, 0x2E9F91B8),
+	MK_64(0xC3F8EF1D, 0x6D518B73),
+	MK_64(0xBDCEC3C4, 0xD5EF332E),
+	MK_64(0x549A7E52, 0x22974487),
+	MK_64(0x67070872, 0x5B749816),
+	MK_64(0xB9CD28FB, 0xF0581BD1),
+	MK_64(0x0E2940B8, 0x15804974)
+};
+
+/* blkSize = 1024 bits. hashSize = 1024 bits */
+const uint64_t SKEIN1024_IV_1024[] = {
+	MK_64(0xD593DA07, 0x41E72355),
+	MK_64(0x15B5E511, 0xAC73E00C),
+	MK_64(0x5180E5AE, 0xBAF2C4F0),
+	MK_64(0x03BD41D3, 0xFCBCAFAF),
+	MK_64(0x1CAEC6FD, 0x1983A898),
+	MK_64(0x6E510B8B, 0xCDD0589F),
+	MK_64(0x77E2BDFD, 0xC6394ADA),
+	MK_64(0xC11E1DB5, 0x24DCB0A3),
+	MK_64(0xD6D14AF9, 0xC6329AB5),
+	MK_64(0x6A9B0BFC, 0x6EB67E0D),
+	MK_64(0x9243C60D, 0xCCFF1332),
+	MK_64(0x1A1F1DDE, 0x743F02D4),
+	MK_64(0x0996753C, 0x10ED0BB8),
+	MK_64(0x6572DD22, 0xF2B4969A),
+	MK_64(0x61FD3062, 0xD00A579A),
+	MK_64(0x1DE0536E, 0x8682E539)
+};
diff --git a/module/icp/algs/skein/skein_port.h b/module/icp/algs/skein/skein_port.h
new file mode 100644
index 000000000..1b0225236
--- /dev/null
+++ b/module/icp/algs/skein/skein_port.h
@@ -0,0 +1,128 @@
+/*
+ * Platform-specific definitions for Skein hash function.
+ *
+ * Source code author: Doug Whiting, 2008.
+ *
+ * This algorithm and source code is released to the public domain.
+ *
+ * Many thanks to Brian Gladman for his portable header files.
+ *
+ * To port Skein to an "unsupported" platform, change the definitions
+ * in this file appropriately.
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#ifndef	_SKEIN_PORT_H_
+#define	_SKEIN_PORT_H_
+
+#include <sys/types.h>	/* get integer type definitions */
+#include <sys/systm.h>	/* for bcopy() */
+
+#ifndef	RotL_64
+#define	RotL_64(x, N)	(((x) << (N)) | ((x) >> (64 - (N))))
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs. The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ *    SKEIN_NEED_SWAP:  0 for little-endian, 1 for big-endian
+ *    Skein_Put64_LSB_First
+ *    Skein_Get64_LSB_First
+ *    Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef	SKEIN_NEED_SWAP		/* compile-time "override" for endianness? */
+
+#include <sys/isa_defs.h>	/* get endianness selection */
+
+#define	PLATFORM_MUST_ALIGN	_ALIGNMENT_REQUIRED
+#if	defined(_BIG_ENDIAN)
+/* here for big-endian CPUs */
+#define	SKEIN_NEED_SWAP   (1)
+#else
+/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define	SKEIN_NEED_SWAP   (0)
+#if	PLATFORM_MUST_ALIGN == 0	/* ok to use "fast" versions? */
+#define	Skein_Put64_LSB_First(dst08, src64, bCnt) bcopy(src64, dst08, bCnt)
+#define	Skein_Get64_LSB_First(dst64, src08, wCnt) \
+	bcopy(src08, dst64, 8 * (wCnt))
+#endif
+#endif
+
+#endif				/* ifndef SKEIN_NEED_SWAP */
+
+/*
+ * Provide any definitions still needed.
+ */
+#ifndef	Skein_Swap64	/* swap for big-endian, nop for little-endian */
+#if	SKEIN_NEED_SWAP
+#define	Skein_Swap64(w64)				\
+	(((((uint64_t)(w64)) & 0xFF) << 56) |		\
+	(((((uint64_t)(w64)) >> 8) & 0xFF) << 48) |	\
+	(((((uint64_t)(w64)) >> 16) & 0xFF) << 40) |	\
+	(((((uint64_t)(w64)) >> 24) & 0xFF) << 32) |	\
+	(((((uint64_t)(w64)) >> 32) & 0xFF) << 24) |	\
+	(((((uint64_t)(w64)) >> 40) & 0xFF) << 16) |	\
+	(((((uint64_t)(w64)) >> 48) & 0xFF) << 8) |	\
+	(((((uint64_t)(w64)) >> 56) & 0xFF)))
+#else
+#define	Skein_Swap64(w64)  (w64)
+#endif
+#endif				/* ifndef Skein_Swap64 */
+
+#ifndef	Skein_Put64_LSB_First
+void
+Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt)
+#ifdef	SKEIN_PORT_CODE		/* instantiate the function code here? */
+{
+	/*
+	 * this version is fully portable (big-endian or little-endian),
+	 * but slow
+	 */
+	size_t n;
+
+	for (n = 0; n < bCnt; n++)
+		dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7)));
+}
+#else
+;				/* output only the function prototype */
+#endif
+#endif				/* ifndef Skein_Put64_LSB_First */
+
+#ifndef	Skein_Get64_LSB_First
+void
+Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt)
+#ifdef	SKEIN_PORT_CODE		/* instantiate the function code here? */
+{
+	/*
+	 * this version is fully portable (big-endian or little-endian),
+	 * but slow
+	 */
+	size_t n;
+
+	for (n = 0; n < 8 * wCnt; n += 8)
+		dst[n / 8] = (((uint64_t)src[n])) +
+		    (((uint64_t)src[n + 1]) << 8) +
+		    (((uint64_t)src[n + 2]) << 16) +
+		    (((uint64_t)src[n + 3]) << 24) +
+		    (((uint64_t)src[n + 4]) << 32) +
+		    (((uint64_t)src[n + 5]) << 40) +
+		    (((uint64_t)src[n + 6]) << 48) +
+		    (((uint64_t)src[n + 7]) << 56);
+}
+#else
+;				/* output only the function prototype */
+#endif
+#endif				/* ifndef Skein_Get64_LSB_First */
+
+#endif	/* _SKEIN_PORT_H_ */
diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S
index b689c9022..d55c5eb48 100644
--- a/module/icp/asm-x86_64/sha2/sha256_impl.S
+++ b/module/icp/asm-x86_64/sha2/sha256_impl.S
@@ -62,11 +62,9 @@
  */
 
 /*
- * This file was generated by a perl script (sha512-x86_64.pl) that could
- * be used to generate sha256 and sha512 variants from the same code base.
- * For our purposes, we only need sha256 and so getting the perl script to
- * run as part of the build process seemed superfluous. The comments from
- * the original file have been pasted above.
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
  */
 
 #if defined(lint) || defined(__lint)
diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S
new file mode 100644
index 000000000..24a41745b
--- /dev/null
+++ b/module/icp/asm-x86_64/sha2/sha512_impl.S
@@ -0,0 +1,2083 @@
+/*
+ * ====================================================================
+ * Written by Andy Polyakov <[email protected]> for the OpenSSL
+ * project. Rights for redistribution and usage in source and binary
+ * forms are granted according to the OpenSSL license.
+ * ====================================================================
+ *
+ * sha256/512_block procedure for x86_64.
+ *
+ * 40% improvement over compiler-generated code on Opteron. On EM64T
+ * sha256 was observed to run >80% faster and sha512 - >40%. No magical
+ * tricks, just straight implementation... I really wonder why gcc
+ * [being armed with inline assembler] fails to generate as fast code.
+ * The only thing which is cool about this module is that it's very
+ * same instruction sequence used for both SHA-256 and SHA-512. In
+ * former case the instructions operate on 32-bit operands, while in
+ * latter - on 64-bit ones. All I had to do is to get one flavor right,
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to IA-64 implementation, which maintains
+ * X[16] in register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is very good result for 3-way
+ * issue Opteron pipeline and X[16] maintained in memory. So that *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP, instruction
+ * level parallelism, on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While Opteron CPU exhibits perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+ * sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers).  Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{	/* lint-only stub; real code is in the #else branch */
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA512TransformBlocks)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	mov	%rsp,%rbp		# copy %rsp
+	shl	$4,%rdx		# num*16
+	sub	$16*8+4*8,%rsp
+	lea	(%rsi,%rdx,8),%rdx	# inp+num*16*8
+	and	$-64,%rsp		# align stack frame
+	add	$8,%rdi		# Skip OpenSolaris field, "algotype"
+	mov	%rdi,16*8+0*8(%rsp)		# save ctx, 1st arg
+	mov	%rsi,16*8+1*8(%rsp)		# save inp, 2nd arg
+	mov	%rdx,16*8+2*8(%rsp)		# save end pointer, "3rd" arg
+	mov	%rbp,16*8+3*8(%rsp)		# save copy of %rsp
+
+	/.picmeup %rbp
+	/ The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+	/ the address of the "next" instruction into the target register
+	/ (%rbp).  This generates these 2 instructions:
+	lea	.Llea(%rip),%rbp
+	/nop	/ .picmeup generates a nop for mod 8 alignment--not needed here
+
+.Llea:
+	lea	K512-.(%rbp),%rbp
+
+	mov	8*0(%rdi),%rax
+	mov	8*1(%rdi),%rbx
+	mov	8*2(%rdi),%rcx
+	mov	8*3(%rdi),%rdx
+	mov	8*4(%rdi),%r8
+	mov	8*5(%rdi),%r9
+	mov	8*6(%rdi),%r10
+	mov	8*7(%rdi),%r11
+	jmp	.Lloop
+
+.align	16
+.Lloop:
+	xor	%rdi,%rdi
+	mov	8*0(%rsi),%r12
+	bswap	%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,0(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	8*1(%rsi),%r12
+	bswap	%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,8(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	8*2(%rsi),%r12
+	bswap	%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,16(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	8*3(%rsi),%r12
+	bswap	%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,24(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	8*4(%rsi),%r12
+	bswap	%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,32(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	8*5(%rsi),%r12
+	bswap	%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,40(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	8*6(%rsi),%r12
+	bswap	%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,48(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	8*7(%rsi),%r12
+	bswap	%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,56(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	mov	8*8(%rsi),%r12
+	bswap	%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,64(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	8*9(%rsi),%r12
+	bswap	%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,72(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	8*10(%rsi),%r12
+	bswap	%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,80(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	8*11(%rsi),%r12
+	bswap	%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,88(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	8*12(%rsi),%r12
+	bswap	%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,96(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	8*13(%rsi),%r12
+	bswap	%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,104(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	8*14(%rsi),%r12
+	bswap	%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,112(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	8*15(%rsi),%r12
+	bswap	%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,120(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	jmp	.Lrounds_16_xx
+.align	16
+.Lrounds_16_xx:
+	mov	8(%rsp),%r13
+	mov	112(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	72(%rsp),%r12
+
+	add	0(%rsp),%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,0(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	16(%rsp),%r13
+	mov	120(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	80(%rsp),%r12
+
+	add	8(%rsp),%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,8(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	24(%rsp),%r13
+	mov	0(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	88(%rsp),%r12
+
+	add	16(%rsp),%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,16(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	32(%rsp),%r13
+	mov	8(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	96(%rsp),%r12
+
+	add	24(%rsp),%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,24(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	40(%rsp),%r13
+	mov	16(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	104(%rsp),%r12
+
+	add	32(%rsp),%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,32(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	48(%rsp),%r13
+	mov	24(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	112(%rsp),%r12
+
+	add	40(%rsp),%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,40(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	56(%rsp),%r13
+	mov	32(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	120(%rsp),%r12
+
+	add	48(%rsp),%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,48(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	64(%rsp),%r13
+	mov	40(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	0(%rsp),%r12
+
+	add	56(%rsp),%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,56(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	mov	72(%rsp),%r13
+	mov	48(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	8(%rsp),%r12
+
+	add	64(%rsp),%r12
+	mov	%r8,%r13
+	mov	%r8,%r14
+	mov	%r9,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r10,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r8,%r15			# (f^g)&e
+	mov	%r12,64(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r10,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r11,%r12			# T1+=h
+
+	mov	%rax,%r11
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rax,%r13
+	mov	%rax,%r14
+
+	ror	$28,%r11
+	ror	$34,%r13
+	mov	%rax,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r11
+	ror	$5,%r13
+	or	%rcx,%r14			# a|c
+
+	xor	%r13,%r11			# h=Sigma0(a)
+	and	%rcx,%r15			# a&c
+	add	%r12,%rdx			# d+=T1
+
+	and	%rbx,%r14			# (a|c)&b
+	add	%r12,%r11			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r11			# h+=Maj(a,b,c)
+	mov	80(%rsp),%r13
+	mov	56(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	16(%rsp),%r12
+
+	add	72(%rsp),%r12
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+	mov	%r8,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r9,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rdx,%r15			# (f^g)&e
+	mov	%r12,72(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r9,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r10,%r12			# T1+=h
+
+	mov	%r11,%r10
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r11,%r13
+	mov	%r11,%r14
+
+	ror	$28,%r10
+	ror	$34,%r13
+	mov	%r11,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r10
+	ror	$5,%r13
+	or	%rbx,%r14			# a|c
+
+	xor	%r13,%r10			# h=Sigma0(a)
+	and	%rbx,%r15			# a&c
+	add	%r12,%rcx			# d+=T1
+
+	and	%rax,%r14			# (a|c)&b
+	add	%r12,%r10			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r10			# h+=Maj(a,b,c)
+	mov	88(%rsp),%r13
+	mov	64(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	24(%rsp),%r12
+
+	add	80(%rsp),%r12
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+	mov	%rdx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r8,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rcx,%r15			# (f^g)&e
+	mov	%r12,80(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r8,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r9,%r12			# T1+=h
+
+	mov	%r10,%r9
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r10,%r13
+	mov	%r10,%r14
+
+	ror	$28,%r9
+	ror	$34,%r13
+	mov	%r10,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r9
+	ror	$5,%r13
+	or	%rax,%r14			# a|c
+
+	xor	%r13,%r9			# h=Sigma0(a)
+	and	%rax,%r15			# a&c
+	add	%r12,%rbx			# d+=T1
+
+	and	%r11,%r14			# (a|c)&b
+	add	%r12,%r9			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r9			# h+=Maj(a,b,c)
+	mov	96(%rsp),%r13
+	mov	72(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	32(%rsp),%r12
+
+	add	88(%rsp),%r12
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+	mov	%rcx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rdx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rbx,%r15			# (f^g)&e
+	mov	%r12,88(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rdx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%r8,%r12			# T1+=h
+
+	mov	%r9,%r8
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r9,%r13
+	mov	%r9,%r14
+
+	ror	$28,%r8
+	ror	$34,%r13
+	mov	%r9,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%r8
+	ror	$5,%r13
+	or	%r11,%r14			# a|c
+
+	xor	%r13,%r8			# h=Sigma0(a)
+	and	%r11,%r15			# a&c
+	add	%r12,%rax			# d+=T1
+
+	and	%r10,%r14			# (a|c)&b
+	add	%r12,%r8			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%r8			# h+=Maj(a,b,c)
+	mov	104(%rsp),%r13
+	mov	80(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	40(%rsp),%r12
+
+	add	96(%rsp),%r12
+	mov	%rax,%r13
+	mov	%rax,%r14
+	mov	%rbx,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rcx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%rax,%r15			# (f^g)&e
+	mov	%r12,96(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rcx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rdx,%r12			# T1+=h
+
+	mov	%r8,%rdx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%r8,%r13
+	mov	%r8,%r14
+
+	ror	$28,%rdx
+	ror	$34,%r13
+	mov	%r8,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rdx
+	ror	$5,%r13
+	or	%r10,%r14			# a|c
+
+	xor	%r13,%rdx			# h=Sigma0(a)
+	and	%r10,%r15			# a&c
+	add	%r12,%r11			# d+=T1
+
+	and	%r9,%r14			# (a|c)&b
+	add	%r12,%rdx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rdx			# h+=Maj(a,b,c)
+	mov	112(%rsp),%r13
+	mov	88(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	48(%rsp),%r12
+
+	add	104(%rsp),%r12
+	mov	%r11,%r13
+	mov	%r11,%r14
+	mov	%rax,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rbx,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r11,%r15			# (f^g)&e
+	mov	%r12,104(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rbx,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rcx,%r12			# T1+=h
+
+	mov	%rdx,%rcx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rdx,%r13
+	mov	%rdx,%r14
+
+	ror	$28,%rcx
+	ror	$34,%r13
+	mov	%rdx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rcx
+	ror	$5,%r13
+	or	%r9,%r14			# a|c
+
+	xor	%r13,%rcx			# h=Sigma0(a)
+	and	%r9,%r15			# a&c
+	add	%r12,%r10			# d+=T1
+
+	and	%r8,%r14			# (a|c)&b
+	add	%r12,%rcx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rcx			# h+=Maj(a,b,c)
+	mov	120(%rsp),%r13
+	mov	96(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	56(%rsp),%r12
+
+	add	112(%rsp),%r12
+	mov	%r10,%r13
+	mov	%r10,%r14
+	mov	%r11,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%rax,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r10,%r15			# (f^g)&e
+	mov	%r12,112(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%rax,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rbx,%r12			# T1+=h
+
+	mov	%rcx,%rbx
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rcx,%r13
+	mov	%rcx,%r14
+
+	ror	$28,%rbx
+	ror	$34,%r13
+	mov	%rcx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rbx
+	ror	$5,%r13
+	or	%r8,%r14			# a|c
+
+	xor	%r13,%rbx			# h=Sigma0(a)
+	and	%r8,%r15			# a&c
+	add	%r12,%r9			# d+=T1
+
+	and	%rdx,%r14			# (a|c)&b
+	add	%r12,%rbx			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rbx			# h+=Maj(a,b,c)
+	mov	0(%rsp),%r13
+	mov	104(%rsp),%r12
+
+	mov	%r13,%r15
+
+	shr	$7,%r13
+	ror	$1,%r15
+
+	xor	%r15,%r13
+	ror	$7,%r15
+
+	xor	%r15,%r13			# sigma0(X[(i+1)&0xf])
+	mov	%r12,%r14
+
+	shr	$6,%r12
+	ror	$19,%r14
+
+	xor	%r14,%r12
+	ror	$42,%r14
+
+	xor	%r14,%r12			# sigma1(X[(i+14)&0xf])
+
+	add	%r13,%r12
+
+	add	64(%rsp),%r12
+
+	add	120(%rsp),%r12
+	mov	%r9,%r13
+	mov	%r9,%r14
+	mov	%r10,%r15
+
+	ror	$14,%r13
+	ror	$18,%r14
+	xor	%r11,%r15			# f^g
+
+	xor	%r14,%r13
+	ror	$23,%r14
+	and	%r9,%r15			# (f^g)&e
+	mov	%r12,120(%rsp)
+
+	xor	%r14,%r13			# Sigma1(e)
+	xor	%r11,%r15			# Ch(e,f,g)=((f^g)&e)^g
+	add	%rax,%r12			# T1+=h
+
+	mov	%rbx,%rax
+	add	%r13,%r12			# T1+=Sigma1(e)
+
+	add	%r15,%r12			# T1+=Ch(e,f,g)
+	mov	%rbx,%r13
+	mov	%rbx,%r14
+
+	ror	$28,%rax
+	ror	$34,%r13
+	mov	%rbx,%r15
+	add	(%rbp,%rdi,8),%r12	# T1+=K[round]
+
+	xor	%r13,%rax
+	ror	$5,%r13
+	or	%rdx,%r14			# a|c
+
+	xor	%r13,%rax			# h=Sigma0(a)
+	and	%rdx,%r15			# a&c
+	add	%r12,%r8			# d+=T1
+
+	and	%rcx,%r14			# (a|c)&b
+	add	%r12,%rax			# h+=T1
+
+	or	%r15,%r14			# Maj(a,b,c)=((a|c)&b)|(a&c)
+	lea	1(%rdi),%rdi	# round++
+
+	add	%r14,%rax			# h+=Maj(a,b,c)
+	cmp	$80,%rdi
+	jb	.Lrounds_16_xx
+
+	mov	16*8+0*8(%rsp),%rdi
+	lea	16*8(%rsi),%rsi
+
+	add	8*0(%rdi),%rax
+	add	8*1(%rdi),%rbx
+	add	8*2(%rdi),%rcx
+	add	8*3(%rdi),%rdx
+	add	8*4(%rdi),%r8
+	add	8*5(%rdi),%r9
+	add	8*6(%rdi),%r10
+	add	8*7(%rdi),%r11
+
+	cmp	16*8+2*8(%rsp),%rsi
+
+	mov	%rax,8*0(%rdi)
+	mov	%rbx,8*1(%rdi)
+	mov	%rcx,8*2(%rdi)
+	mov	%rdx,8*3(%rdi)
+	mov	%r8,8*4(%rdi)
+	mov	%r9,8*5(%rdi)
+	mov	%r10,8*6(%rdi)
+	mov	%r11,8*7(%rdi)
+	jb	.Lloop
+
+	mov	16*8+3*8(%rsp),%rsp
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+
+	ret
+SET_SIZE(SHA512TransformBlocks)
+
+/*
+ * K512: 80 64-bit constants, emitted two per line and aligned to 64 bytes.
+ * The round code above adds K[round] from a table addressed through %rbp
+ * while the round counter runs below 80; presumably %rbp points here --
+ * confirm against the part of the routine that sets it up.
+ */
+.align	64
+.type	K512,@object
+K512:
+	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
+	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
+	.quad	0x3956c25bf348b538,0x59f111f1b605d019
+	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
+	.quad	0xd807aa98a3030242,0x12835b0145706fbe
+	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
+	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
+	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
+	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
+	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
+	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
+	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
+	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
+	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
+	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
+	.quad	0x06ca6351e003826f,0x142929670a0e6e70
+	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
+	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
+	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
+	.quad	0x81c2c92e47edaee6,0x92722c851482353b
+	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
+	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
+	.quad	0xd192e819d6ef5218,0xd69906245565a910
+	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
+	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
+	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
+	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
+	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
+	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
+	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
+	.quad	0x90befffa23631e28,0xa4506cebde82bde9
+	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
+	.quad	0xca273eceea26619c,0xd186b8c721c0c207
+	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
+	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
+	.quad	0x113f9804bef90dae,0x1b710b35131c471b
+	.quad	0x28db77f523047d84,0x32caab7b40c72493
+	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
+	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
+	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
+#endif /* !lint && !__lint */
diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c
index 7dd5dbf42..aa63e431f 100644
--- a/module/icp/illumos-crypto.c
+++ b/module/icp/illumos-crypto.c
@@ -109,8 +109,10 @@
 void __exit
 icp_fini(void)
 {
+	skein_mod_fini();
 	sha2_mod_fini();
 	sha1_mod_fini();
+	edonr_mod_fini();
 	aes_mod_fini();
 	kcf_sched_destroy();
 	kcf_prov_tab_destroy();
@@ -139,8 +141,10 @@ icp_init(void)
 
 	/* initialize algorithms */
 	aes_mod_init();
+	edonr_mod_init();
 	sha1_mod_init();
 	sha2_mod_init();
+	skein_mod_init();
 
 	return (0);
 }
diff --git a/module/icp/include/sha2/sha2_impl.h b/module/icp/include/sha2/sha2_impl.h
index bb42c3cd4..b9768d344 100644
--- a/module/icp/include/sha2/sha2_impl.h
+++ b/module/icp/include/sha2/sha2_impl.h
@@ -26,6 +26,8 @@
 #ifndef	_SHA2_IMPL_H
 #define	_SHA2_IMPL_H
 
+#include <sys/sha2.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/module/icp/io/edonr_mod.c b/module/icp/io/edonr_mod.c
new file mode 100644
index 000000000..19b5c963d
--- /dev/null
+++ b/module/icp/io/edonr_mod.c
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#include <sys/edonr.h>
+
+/*
+ * Unlike sha2 or skein, we won't expose edonr via the Kernel Cryptographic
+ * Framework (KCF), because Edon-R is *NOT* suitable for general-purpose
+ * cryptographic use. Users of Edon-R must interface directly to this module.
+ */
+
+/* Misc linkage entry for the Edon-R module. */
+static struct modlmisc modlmisc = {
+	&mod_cryptoops,
+	"Edon-R Message-Digest Algorithm"
+};
+
+/*
+ * Linkage handed to mod_install() and mod_remove() below. It lists only the
+ * misc entry above; no crypto-provider entry is present.
+ */
+static struct modlinkage modlinkage = {
+	MODREV_1, {&modlmisc, NULL}
+};
+
+/*
+ * Installs the Edon-R module linkage. Returns 0 on success, otherwise the
+ * nonzero value reported by mod_install().
+ */
+int
+edonr_mod_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+/*
+ * Removes the linkage installed by edonr_mod_init() and returns the result
+ * of mod_remove().
+ */
+int
+edonr_mod_fini(void)
+{
+	int status = mod_remove(&modlinkage);
+
+	return (status);
+}
diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c
index be0f7a42c..3913d7618 100644
--- a/module/icp/io/sha2_mod.c
+++ b/module/icp/io/sha2_mod.c
@@ -30,7 +30,7 @@
 #include <sys/crypto/spi.h>
 #include <sys/crypto/icp.h>
 #define	_SHA2_IMPL
-#include <sha2/sha2.h>
+#include <sys/sha2.h>
 #include <sha2/sha2_impl.h>
 
 /*
diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c
new file mode 100644
index 000000000..e909a7e31
--- /dev/null
+++ b/module/icp/io/skein_mod.c
@@ -0,0 +1,721 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+#include <sys/modctl.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/spi.h>
+#include <sys/sysmacros.h>
+#include <sys/systm.h>
+#define	SKEIN_MODULE_IMPL
+#include <sys/skein.h>
+
+/*
+ * Like the sha2 module, the skein module's single modlinkage carries two
+ * linkage entries:
+ * - modlmisc to allow direct calls to Skein_* API functions.
+ * - modlcrypto to integrate well into the Kernel Crypto Framework (KCF).
+ */
+static struct modlmisc modlmisc = {
+	&mod_cryptoops,
+	"Skein Message-Digest Algorithm"
+};
+
+static struct modlcrypto modlcrypto = {
+	&mod_cryptoops,
+	"Skein Kernel SW Provider"
+};
+
+/* Passed to mod_install() and mod_remove() below. */
+static struct modlinkage modlinkage = {
+	MODREV_1, {&modlmisc, &modlcrypto, NULL}
+};
+
+/*
+ * Mechanism table referenced from skein_prov_info. Each Skein variant (256,
+ * 512, 1024) appears twice: once flagged for digest use and once flagged for
+ * MAC use. The digest rows carry 0, 0 with a bit unit and the MAC rows carry
+ * 1, INT_MAX with a byte unit -- presumably the minimum and maximum key
+ * sizes; confirm against the crypto_mech_info_t definition.
+ */
+static crypto_mech_info_t skein_mech_info_tab[] = {
+	{CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE,
+	    CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+	    0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+	{CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE,
+	    CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+	    CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+	{CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE,
+	    CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+	    0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+	{CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE,
+	    CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+	    CRYPTO_KEYSIZE_UNIT_IN_BYTES},
+	{CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE,
+	    CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC,
+	    0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS},
+	{CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE,
+	    CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX,
+	    CRYPTO_KEYSIZE_UNIT_IN_BYTES}
+};
+
+static void skein_provider_status(crypto_provider_handle_t, uint_t *);
+
+/* Control operations: only the provider-status callback is supplied. */
+static crypto_control_ops_t skein_control_ops = {
+	skein_provider_status
+};
+
+static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *,
+    crypto_req_handle_t);
+static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+    crypto_req_handle_t);
+static int skein_update(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t);
+static int skein_digest_atomic(crypto_provider_handle_t, crypto_session_id_t,
+    crypto_mechanism_t *, crypto_data_t *, crypto_data_t *,
+    crypto_req_handle_t);
+
+/*
+ * Digest entry points, listed positionally. The fourth slot is left NULL;
+ * which operation it stands for is defined by crypto_digest_ops_t, which is
+ * not part of this file.
+ */
+static crypto_digest_ops_t skein_digest_ops = {
+	skein_digest_init,
+	skein_digest,
+	skein_update,
+	NULL,
+	skein_final,
+	skein_digest_atomic
+};
+
+static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *,
+    crypto_spi_ctx_template_t, crypto_req_handle_t);
+static int skein_mac_atomic(crypto_provider_handle_t, crypto_session_id_t,
+    crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *,
+    crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+/*
+ * MAC entry points, listed positionally. Only init and the atomic form are
+ * MAC-specific; update and final are shared with the digest table. The two
+ * NULL slots are operations this provider does not supply.
+ */
+static crypto_mac_ops_t skein_mac_ops = {
+	skein_mac_init,
+	NULL,
+	skein_update,	/* using regular digest update is OK here */
+	skein_final,	/* using regular digest final is OK here */
+	skein_mac_atomic,
+	NULL
+};
+
+static int skein_create_ctx_template(crypto_provider_handle_t,
+    crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *,
+    size_t *, crypto_req_handle_t);
+static int skein_free_context(crypto_ctx_t *);
+
+/* Context management: template construction and context release. */
+static crypto_ctx_ops_t skein_ctx_ops = {
+	skein_create_ctx_template,
+	skein_free_context
+};
+
+/*
+ * Top-level operations vector referenced from skein_prov_info. Only the
+ * control, digest, MAC and context tables are populated; every other slot is
+ * NULL.
+ */
+static crypto_ops_t skein_crypto_ops = {{{{{
+	&skein_control_ops,
+	&skein_digest_ops,
+	NULL,
+	&skein_mac_ops,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	&skein_ctx_ops,
+}}}}};
+
+/*
+ * Provider description passed to crypto_register_provider() in
+ * skein_mod_init(): it names the provider, points at the operations vector
+ * and supplies the mechanism table together with its element count.
+ */
+static crypto_provider_info_t skein_prov_info = {{{{
+	CRYPTO_SPI_VERSION_1,
+	"Skein Software Provider",
+	CRYPTO_SW_PROVIDER,
+	NULL,
+	&skein_crypto_ops,
+	sizeof (skein_mech_info_tab) / sizeof (crypto_mech_info_t),
+	skein_mech_info_tab
+}}}};
+
+/*
+ * Starts out as 0; its address is given to crypto_register_provider() in
+ * skein_mod_init().
+ */
+static crypto_kcf_provider_handle_t skein_prov_handle = 0;
+
+/*
+ * Per-operation state. The entry points below reach it through a
+ * crypto_ctx_t's cc_provider_private pointer (see SKEIN_CTX).
+ */
+typedef struct skein_ctx {
+	/* Mechanism chosen at init time; SKEIN_OP dispatches on it. */
+	skein_mech_type_t		sc_mech_type;
+	/* Output length in bits, as set by skein_get_digest_bitlen(). */
+	size_t				sc_digest_bitlen;
+	/* Variant-specific hashing state; one member is used per context. */
+	/*LINTED(E_ANONYMOUS_UNION_DECL)*/
+	union {
+		Skein_256_Ctxt_t	sc_256;
+		Skein_512_Ctxt_t	sc_512;
+		Skein1024_Ctxt_t	sc_1024;
+	};
+} skein_ctx_t;
+/* The provider-private pointer of `_ctx_', viewed as a skein_ctx_t *. */
+#define	SKEIN_CTX(_ctx_)	((skein_ctx_t *)((_ctx_)->cc_provider_private))
+/* The same pointer without the cast, usable as an assignment target. */
+#define	SKEIN_CTX_LVALUE(_ctx_)	(_ctx_)->cc_provider_private
+/*
+ * Calls Skein_256_<_op>, Skein_512_<_op> or Skein1024_<_op> on the matching
+ * union member, selected by sc_mech_type; the digest and MAC mechanism types
+ * of one variant share a branch. The callee's return value is discarded, and
+ * an sc_mech_type not listed here results in no call at all. `_skein_ctx' is
+ * evaluated once, into a local named `sc'.
+ */
+#define	SKEIN_OP(_skein_ctx, _op, ...)					\
+	do {								\
+		skein_ctx_t	*sc = (_skein_ctx);			\
+		switch (sc->sc_mech_type) {				\
+		case SKEIN_256_MECH_INFO_TYPE:				\
+		case SKEIN_256_MAC_MECH_INFO_TYPE:			\
+			(void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\
+			break;						\
+		case SKEIN_512_MECH_INFO_TYPE:				\
+		case SKEIN_512_MAC_MECH_INFO_TYPE:			\
+			(void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\
+			break;						\
+		case SKEIN1024_MECH_INFO_TYPE:				\
+		case SKEIN1024_MAC_MECH_INFO_TYPE:			\
+			(void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\
+			break;						\
+		}							\
+		_NOTE(CONSTCOND)					\
+	} while (0)
+
+/*
+ * Determines the digest length, in bits, requested by `mechanism' and stores
+ * it in `*result'.
+ *
+ * When cm_param is set it is read as a skein_param_t: its length must match
+ * exactly and sp_digest_bitlen must be nonzero, otherwise
+ * CRYPTO_MECHANISM_PARAM_INVALID is returned. When cm_param is NULL a
+ * default is used, and only the three plain digest mechanism types have one
+ * (256, 512 and 1024 bits); any other cm_type, the MAC types included, gets
+ * CRYPTO_MECHANISM_INVALID.
+ */
+static int
+skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result)
+{
+	/*LINTED(E_BAD_PTR_CAST_ALIGN)*/
+	skein_param_t	*param = (skein_param_t *)mechanism->cm_param;
+
+	if (param != NULL) {
+		if (mechanism->cm_param_len != sizeof (*param) ||
+		    param->sp_digest_bitlen == 0)
+			return (CRYPTO_MECHANISM_PARAM_INVALID);
+		*result = param->sp_digest_bitlen;
+		return (CRYPTO_SUCCESS);
+	}
+
+	/* No explicit parameter: fall back to the variant's default. */
+	switch (mechanism->cm_type) {
+	case SKEIN_256_MECH_INFO_TYPE:
+		*result = 256;
+		return (CRYPTO_SUCCESS);
+	case SKEIN_512_MECH_INFO_TYPE:
+		*result = 512;
+		return (CRYPTO_SUCCESS);
+	case SKEIN1024_MECH_INFO_TYPE:
+		*result = 1024;
+		return (CRYPTO_SUCCESS);
+	default:
+		return (CRYPTO_MECHANISM_INVALID);
+	}
+}
+
+/*
+ * Installs the module linkage and then offers the provider to KCF. Returns
+ * the nonzero mod_install() result on failure and 0 otherwise.
+ */
+int
+skein_mod_init(void)
+{
+	int error = mod_install(&modlinkage);
+
+	if (error != 0)
+		return (error);
+
+	/*
+	 * KCF registration is best-effort: its result is deliberately
+	 * ignored so that the misc/skein functionality stays available even
+	 * when the provider could not be registered.
+	 */
+	(void) crypto_register_provider(&skein_prov_info, &skein_prov_handle);
+
+	return (0);
+}
+
+/*
+ * Tears the module down: the KCF provider registered by skein_mod_init() is
+ * unregistered first, so that KCF holds no reference to this provider's
+ * tables once the linkage is gone, and then the linkage is removed.
+ *
+ * Returns EBUSY when KCF refuses to unregister the provider (the linkage is
+ * then left installed), otherwise the result of mod_remove().
+ */
+int
+skein_mod_fini(void)
+{
+	/* A handle of 0 means registration never succeeded. */
+	if (skein_prov_handle != 0) {
+		if (crypto_unregister_provider(skein_prov_handle) !=
+		    CRYPTO_SUCCESS)
+			return (EBUSY);
+		skein_prov_handle = 0;
+	}
+
+	return (mod_remove(&modlinkage));
+}
+
+/*
+ * KCF software provider control entry points.
+ */
+
+/*
+ * Status callback installed in skein_control_ops. It unconditionally stores
+ * CRYPTO_PROVIDER_READY in `*status'; `provider' is not consulted.
+ */
+/* ARGSUSED */
+static void
+skein_provider_status(crypto_provider_handle_t provider, uint_t *status)
+{
+	*status = CRYPTO_PROVIDER_READY;
+}
+
+/*
+ * General Skein hashing helper functions.
+ */
+
+/*
+ * Feeds the uio-described input in `data' to the hash in `ctx'.
+ *
+ * cd_offset bytes are skipped first, then up to cd_length bytes are passed
+ * to the Update routine, one iovec at a time. Returns CRYPTO_ARGUMENTS_BAD
+ * for a uio that is not UIO_SYSSPACE, CRYPTO_DATA_LEN_RANGE when the offset
+ * or the length runs past the supplied iovecs, and CRYPTO_SUCCESS otherwise.
+ */
+static int
+skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data)
+{
+	const uio_t	*uio = data->cd_uio;
+	off_t		skip = data->cd_offset;
+	size_t		remaining = data->cd_length;
+	uint_t		i = 0;
+
+	/* Only kernel-space buffers are handled. */
+	if (uio->uio_segflg != UIO_SYSSPACE)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	/* Walk past every iovec that lies wholly before the offset. */
+	while (i < uio->uio_iovcnt && skip >= uio->uio_iov[i].iov_len) {
+		skip -= uio->uio_iov[i].iov_len;
+		i++;
+	}
+
+	/* The offset is at or beyond the end of the supplied buffers. */
+	if (i == uio->uio_iovcnt)
+		return (CRYPTO_DATA_LEN_RANGE);
+
+	/* Hash iovec by iovec; only the first one starts at an offset. */
+	for (; i < uio->uio_iovcnt && remaining > 0; i++) {
+		size_t chunk = MIN(uio->uio_iov[i].iov_len - skip, remaining);
+
+		SKEIN_OP(ctx, Update,
+		    (uint8_t *)uio->uio_iov[i].iov_base + skip, chunk);
+		remaining -= chunk;
+		skip = 0;
+	}
+
+	/* The caller asked for more data than the iovecs contain. */
+	if (i == uio->uio_iovcnt && remaining > 0)
+		return (CRYPTO_DATA_LEN_RANGE);
+
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Performs a Final on a context and writes to a uio digest output.
+ *
+ * Returns CRYPTO_ARGUMENTS_BAD when the uio is not UIO_SYSSPACE,
+ * CRYPTO_DATA_LEN_RANGE when cd_offset lies beyond the supplied iovecs or
+ * the iovecs end before the whole digest has been stored,
+ * CRYPTO_HOST_MEMORY when the temporary buffer cannot be allocated, and
+ * CRYPTO_SUCCESS otherwise.
+ */
+static int
+skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest,
+    crypto_req_handle_t req)
+{
+	off_t	offset = digest->cd_offset;
+	uint_t	vec_idx;
+	uio_t	*uio = digest->cd_uio;
+
+	/* we support only kernel buffer */
+	if (uio->uio_segflg != UIO_SYSSPACE)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	/*
+	 * Jump to the first iovec containing ptr to the digest to be returned.
+	 * The index is range-checked before uio_iov[vec_idx] is read, so the
+	 * scan never looks past the end of the iovec array.
+	 */
+	for (vec_idx = 0; vec_idx < uio->uio_iovcnt &&
+	    offset >= uio->uio_iov[vec_idx].iov_len;
+	    offset -= uio->uio_iov[vec_idx++].iov_len)
+		;
+	if (vec_idx == uio->uio_iovcnt) {
+		/*
+		 * The caller specified an offset that is larger than the
+		 * total size of the buffers it provided.
+		 */
+		return (CRYPTO_DATA_LEN_RANGE);
+	}
+	if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <=
+	    uio->uio_iov[vec_idx].iov_len) {
+		/* The computed digest will fit in the current iovec. */
+		SKEIN_OP(ctx, Final,
+		    (uchar_t *)uio->uio_iov[vec_idx].iov_base + offset);
+	} else {
+		/*
+		 * The digest straddles iovecs: produce it in a temporary
+		 * buffer and scatter it piecewise.
+		 */
+		uint8_t *digest_tmp;
+		off_t scratch_offset = 0;
+		size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen);
+		size_t cur_len;
+
+		digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES(
+		    ctx->sc_digest_bitlen), crypto_kmflag(req));
+		if (digest_tmp == NULL)
+			return (CRYPTO_HOST_MEMORY);
+		SKEIN_OP(ctx, Final, digest_tmp);
+		while (vec_idx < uio->uio_iovcnt && length > 0) {
+			cur_len = MIN(uio->uio_iov[vec_idx].iov_len - offset,
+			    length);
+			/* Byte pointer: no arithmetic on a void pointer. */
+			bcopy(digest_tmp + scratch_offset,
+			    (uint8_t *)uio->uio_iov[vec_idx].iov_base + offset,
+			    cur_len);
+
+			length -= cur_len;
+			vec_idx++;
+			scratch_offset += cur_len;
+			offset = 0;
+		}
+		kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen));
+
+		if (vec_idx == uio->uio_iovcnt && length > 0) {
+			/*
+			 * The end of the specified iovec's was reached but
+			 * the digest could not be stored completely, i.e.
+			 * the caller provided less room than the digest
+			 * needs.
+			 */
+			return (CRYPTO_DATA_LEN_RANGE);
+		}
+	}
+
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider digest entry points.
+ */
+
+/*
+ * Sets up a skein digest context as described by `mechanism' and attaches it
+ * to `ctx'. cm_type has to satisfy VALID_SKEIN_DIGEST_MECH. cm_param may
+ * hold a skein_param_t giving the digest length to produce; without it the
+ * default lengths apply (32 bytes for Skein-256, 64 bytes for Skein-512 and
+ * 128 bytes for Skein-1024).
+ *
+ * Returns CRYPTO_MECHANISM_INVALID for an unsupported cm_type,
+ * CRYPTO_HOST_MEMORY when the context cannot be allocated, or the error from
+ * skein_get_digest_bitlen(); in the last case the context is wiped, freed
+ * and detached from `ctx' again.
+ */
+static int
+skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_req_handle_t req)
+{
+	skein_ctx_t	*sctx;
+	int		error;
+
+	if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+		return (CRYPTO_MECHANISM_INVALID);
+
+	sctx = kmem_alloc(sizeof (*sctx), crypto_kmflag(req));
+	SKEIN_CTX_LVALUE(ctx) = sctx;
+	if (sctx == NULL)
+		return (CRYPTO_HOST_MEMORY);
+
+	sctx->sc_mech_type = mechanism->cm_type;
+	error = skein_get_digest_bitlen(mechanism, &sctx->sc_digest_bitlen);
+	if (error != CRYPTO_SUCCESS) {
+		bzero(sctx, sizeof (*sctx));
+		kmem_free(sctx, sizeof (*sctx));
+		SKEIN_CTX_LVALUE(ctx) = NULL;
+		return (error);
+	}
+
+	SKEIN_OP(sctx, Init, sctx->sc_digest_bitlen);
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Runs skein_update and skein_final back to back on a context that was
+ * already initialized. See those functions for the meaning of the arguments.
+ *
+ * If `digest' is too small, cd_length is set to the size required and
+ * CRYPTO_BUFFER_TOO_SMALL is returned with the context left intact. If the
+ * update step fails, the context is wiped and freed here and cd_length is
+ * set to 0.
+ */
+static int
+skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest,
+    crypto_req_handle_t req)
+{
+	size_t	need;
+	int	error;
+
+	ASSERT(SKEIN_CTX(ctx) != NULL);
+
+	need = CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen);
+	if (digest->cd_length < need) {
+		digest->cd_length = need;
+		return (CRYPTO_BUFFER_TOO_SMALL);
+	}
+
+	error = skein_update(ctx, data, req);
+	if (error == CRYPTO_SUCCESS)
+		return (skein_final(ctx, digest, req));
+
+	/* Update failed: dispose of the context, nothing was produced. */
+	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+	digest->cd_length = 0;
+	return (error);
+}
+
+/*
+ * Pushes the message bytes in `data' into the hash; it may be called
+ * repeatedly to supply more input. Shared by the digest and MAC operation
+ * tables. Raw and uio inputs are handled; any other cd_format yields
+ * CRYPTO_ARGUMENTS_BAD.
+ */
+/*ARGSUSED*/
+static int
+skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req)
+{
+	ASSERT(SKEIN_CTX(ctx) != NULL);
+
+	if (data->cd_format == CRYPTO_DATA_UIO)
+		return (skein_digest_update_uio(SKEIN_CTX(ctx), data));
+	if (data->cd_format != CRYPTO_DATA_RAW)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	/* Raw input: one contiguous buffer starting at cd_offset. */
+	SKEIN_OP(SKEIN_CTX(ctx), Update,
+	    (uint8_t *)data->cd_raw.iov_base + data->cd_offset,
+	    data->cd_length);
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Finishes the hash and stores the result in `digest'. Shared by the digest
+ * and MAC operation tables. Raw and uio outputs are handled; any other
+ * cd_format yields CRYPTO_ARGUMENTS_BAD.
+ *
+ * If `digest' is too small, cd_length is set to the size required and
+ * CRYPTO_BUFFER_TOO_SMALL is returned with the context untouched. On every
+ * other path the context is wiped, released with kmem_free() and detached
+ * from `ctx', so it has to come from kmem_alloc(), as it does in
+ * skein_digest_init() and skein_mac_init(). cd_length ends up as the digest
+ * size on success and 0 on failure.
+ */
+/*ARGSUSED*/
+static int
+skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req)
+{
+	skein_ctx_t	*sctx = SKEIN_CTX(ctx);
+	size_t		need;
+	int		error = CRYPTO_SUCCESS;
+
+	ASSERT(sctx != NULL);
+
+	need = CRYPTO_BITS2BYTES(sctx->sc_digest_bitlen);
+	if (digest->cd_length < need) {
+		digest->cd_length = need;
+		return (CRYPTO_BUFFER_TOO_SMALL);
+	}
+
+	if (digest->cd_format == CRYPTO_DATA_RAW) {
+		SKEIN_OP(sctx, Final,
+		    (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset);
+	} else if (digest->cd_format == CRYPTO_DATA_UIO) {
+		error = skein_digest_final_uio(sctx, digest, req);
+	} else {
+		error = CRYPTO_ARGUMENTS_BAD;
+	}
+
+	digest->cd_length = (error == CRYPTO_SUCCESS) ? need : 0;
+
+	bzero(sctx, sizeof (*sctx));
+	kmem_free(sctx, sizeof (*sctx));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+
+	return (error);
+}
+
+/*
+ * Performs a full skein digest computation in a single call, configuring the
+ * algorithm according to `mechanism', reading the input to be digested from
+ * `data' and writing the output to `digest'.
+ * Supported input/output formats are raw and uio.
+ *
+ * The hashing context lives on this function's stack, so the output step is
+ * carried out here rather than through skein_final(), which kmem_free()s the
+ * context it is given. If `digest' is too small, cd_length is set to the
+ * size required and CRYPTO_BUFFER_TOO_SMALL is returned; on any other
+ * failure cd_length is set to 0.
+ */
+/*ARGSUSED*/
+static int
+skein_digest_atomic(crypto_provider_handle_t provider,
+    crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+    crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req)
+{
+	int		error;
+	size_t		digest_len = 0;
+	skein_ctx_t	skein_ctx;
+	crypto_ctx_t	ctx;
+	SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+	/* Init */
+	if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+		return (CRYPTO_MECHANISM_INVALID);
+	skein_ctx.sc_mech_type = mechanism->cm_type;
+	error = skein_get_digest_bitlen(mechanism, &skein_ctx.sc_digest_bitlen);
+	if (error != CRYPTO_SUCCESS)
+		goto out;
+	digest_len = CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen);
+	if (digest->cd_length < digest_len) {
+		/* Tell the caller how much room is needed. */
+		digest->cd_length = digest_len;
+		bzero(&skein_ctx, sizeof (skein_ctx));
+		return (CRYPTO_BUFFER_TOO_SMALL);
+	}
+	SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen);
+
+	/* Update: the input comes from `data'. */
+	if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+		goto out;
+
+	/* Final: the output goes to `digest'; nothing is freed. */
+	switch (digest->cd_format) {
+	case CRYPTO_DATA_RAW:
+		SKEIN_OP(&skein_ctx, Final,
+		    (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset);
+		break;
+	case CRYPTO_DATA_UIO:
+		error = skein_digest_final_uio(&skein_ctx, digest, req);
+		break;
+	default:
+		error = CRYPTO_ARGUMENTS_BAD;
+	}
+
+out:
+	if (error == CRYPTO_SUCCESS)
+		digest->cd_length = digest_len;
+	else
+		digest->cd_length = 0;
+	bzero(&skein_ctx, sizeof (skein_ctx));
+
+	return (error);
+}
+
+/*
+ * Fills in `ctx' for a Skein MAC described by `mechanism' and keyed with
+ * `key'.
+ *
+ * Returns CRYPTO_MECHANISM_INVALID when cm_type fails VALID_SKEIN_MAC_MECH,
+ * CRYPTO_ARGUMENTS_BAD when the key is not in CRYPTO_KEY_RAW format, or the
+ * error from skein_get_digest_bitlen(). On success the hash state has been
+ * set up with InitExt using the key bytes, with ck_length converted through
+ * CRYPTO_BITS2BYTES.
+ */
+static int
+skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_key_t *key)
+{
+	int	rv;
+
+	if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type)) {
+		rv = CRYPTO_MECHANISM_INVALID;
+	} else if (key->ck_format != CRYPTO_KEY_RAW) {
+		rv = CRYPTO_ARGUMENTS_BAD;
+	} else {
+		ctx->sc_mech_type = mechanism->cm_type;
+		rv = skein_get_digest_bitlen(mechanism,
+		    &ctx->sc_digest_bitlen);
+		if (rv == CRYPTO_SUCCESS) {
+			SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0,
+			    key->ck_data, CRYPTO_BITS2BYTES(key->ck_length));
+		}
+	}
+
+	return (rv);
+}
+
+/*
+ * KCF software provider mac entry points.
+ */
+/*
+ * Initializes a skein MAC context. You may pass a ctx_template, in which
+ * case the template will be reused to make initialization more efficient.
+ * Otherwise a new context will be constructed. The mechanism cm_type must
+ * be one of SKEIN_*_MAC_MECH_INFO_TYPE. Same as in skein_digest_init, you
+ * may pass a skein_param_t in cm_param to configure the length of the
+ * digest. The key must be in raw format.
+ *
+ * Returns CRYPTO_HOST_MEMORY when the context cannot be allocated, or the
+ * error from skein_mac_ctx_build(). In the latter case the context is
+ * wiped, freed and detached from `ctx', so no stale pointer is left behind.
+ */
+static int
+skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+    crypto_req_handle_t req)
+{
+	int	error;
+
+	SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
+	    crypto_kmflag(req));
+	if (SKEIN_CTX(ctx) == NULL)
+		return (CRYPTO_HOST_MEMORY);
+
+	if (ctx_template != NULL) {
+		bcopy(ctx_template, SKEIN_CTX(ctx),
+		    sizeof (*SKEIN_CTX(ctx)));
+	} else {
+		error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key);
+		if (error != CRYPTO_SUCCESS)
+			goto errout;
+	}
+
+	return (CRYPTO_SUCCESS);
+errout:
+	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	/*
+	 * Clear the pointer, as skein_digest_init() does, so that a later
+	 * skein_free_context() cannot free the same memory a second time.
+	 */
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+	return (error);
+}
+
+/*
+ * The MAC update and final calls are reused from the regular digest code.
+ */
+
+/*
+ * Same as skein_digest_atomic, performs an atomic Skein MAC operation in
+ * one step. All the same properties apply to the arguments of this
+ * function as to those of the partial operations above.
+ *
+ * The hashing context lives on this function's stack, so the output step is
+ * carried out here rather than through skein_final(), which kmem_free()s the
+ * context it is given. If `mac' is too small, cd_length is set to the size
+ * required and CRYPTO_BUFFER_TOO_SMALL is returned. The context is wiped
+ * before returning on every path.
+ */
+/*ARGSUSED*/
+static int
+skein_mac_atomic(crypto_provider_handle_t provider,
+    crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+    crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac,
+    crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req)
+{
+	int		error;
+	size_t		mac_len;
+	/* faux crypto context just for skein_update() */
+	crypto_ctx_t	ctx;
+	skein_ctx_t	skein_ctx;
+	SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+	if (ctx_template != NULL) {
+		bcopy(ctx_template, &skein_ctx, sizeof (skein_ctx));
+	} else {
+		error = skein_mac_ctx_build(&skein_ctx, mechanism, key);
+		if (error != CRYPTO_SUCCESS)
+			goto out;
+	}
+
+	if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+		goto out;
+
+	mac_len = CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen);
+	if (mac->cd_length < mac_len) {
+		/* Tell the caller how much room is needed. */
+		mac->cd_length = mac_len;
+		error = CRYPTO_BUFFER_TOO_SMALL;
+		goto out;
+	}
+
+	/* Final: the output goes to `mac'; nothing is freed. */
+	switch (mac->cd_format) {
+	case CRYPTO_DATA_RAW:
+		SKEIN_OP(&skein_ctx, Final,
+		    (uint8_t *)mac->cd_raw.iov_base + mac->cd_offset);
+		break;
+	case CRYPTO_DATA_UIO:
+		error = skein_digest_final_uio(&skein_ctx, mac, req);
+		break;
+	default:
+		error = CRYPTO_ARGUMENTS_BAD;
+	}
+	mac->cd_length = (error == CRYPTO_SUCCESS) ? mac_len : 0;
+
+out:
+	bzero(&skein_ctx, sizeof (skein_ctx));
+	return (error);
+}
+
+/*
+ * KCF software provider context management entry points.
+ */
+
+/*
+ * Builds a reusable context template for the Skein MAC algorithm. The
+ * arguments follow the same rules as those of skein_mac_init.
+ *
+ * On success `*ctx_template' receives a newly allocated skein_ctx_t and
+ * `*ctx_template_size' its size. Returns CRYPTO_HOST_MEMORY when the
+ * allocation fails, or the error from skein_mac_ctx_build(), in which case
+ * the allocation is wiped and freed and the outputs are left untouched.
+ */
+/*ARGSUSED*/
+static int
+skein_create_ctx_template(crypto_provider_handle_t provider,
+    crypto_mechanism_t *mechanism, crypto_key_t *key,
+    crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size,
+    crypto_req_handle_t req)
+{
+	skein_ctx_t	*tmpl;
+	int		rv;
+
+	tmpl = kmem_alloc(sizeof (*tmpl), crypto_kmflag(req));
+	if (tmpl == NULL)
+		return (CRYPTO_HOST_MEMORY);
+
+	rv = skein_mac_ctx_build(tmpl, mechanism, key);
+	if (rv != CRYPTO_SUCCESS) {
+		bzero(tmpl, sizeof (*tmpl));
+		kmem_free(tmpl, sizeof (*tmpl));
+		return (rv);
+	}
+
+	*ctx_template = tmpl;
+	*ctx_template_size = sizeof (*tmpl);
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * Releases the skein context attached to `ctx', if there is one: the memory
+ * is wiped, freed and the pointer cleared. Always returns CRYPTO_SUCCESS.
+ */
+static int
+skein_free_context(crypto_ctx_t *ctx)
+{
+	skein_ctx_t	*sctx = SKEIN_CTX(ctx);
+
+	/* Nothing attached, nothing to do. */
+	if (sctx == NULL)
+		return (CRYPTO_SUCCESS);
+
+	bzero(sctx, sizeof (*sctx));
+	kmem_free(sctx, sizeof (*sctx));
+	SKEIN_CTX_LVALUE(ctx) = NULL;
+
+	return (CRYPTO_SUCCESS);
+}
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index 5436bae9a..8a975ecb3 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -23,6 +23,9 @@
  * Use is subject to license terms.
  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  */
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
 
 /*
  * Fletcher Checksums
@@ -206,8 +209,10 @@ static struct fletcher_4_kstat {
 /* Indicate that benchmark has been completed */
 static boolean_t fletcher_4_initialized = B_FALSE;
 
+/*ARGSUSED*/
 void
-fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_2_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
@@ -223,8 +228,10 @@ fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 	ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 }
 
+/*ARGSUSED*/
 void
-fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_2_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const uint64_t *ip = buf;
 	const uint64_t *ipend = ip + (size / sizeof (uint64_t));
@@ -404,8 +411,10 @@ fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf,
 		ops->fini_native(zcp);
 }
 
+/*ARGSUSED*/
 void
-fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_4_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const fletcher_4_ops_t *ops;
 	uint64_t p2size = P2ALIGN(size, 64);
@@ -443,8 +452,10 @@ fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf,
 		ops->fini_byteswap(zcp);
 }
 
+/*ARGSUSED*/
 void
-fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
+fletcher_4_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	const fletcher_4_ops_t *ops;
 	uint64_t p2size = P2ALIGN(size, 64);
@@ -551,7 +562,7 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 		start = gethrtime();
 		do {
 			for (l = 0; l < 32; l++, run_count++)
-				fletcher_4_test(data, data_size, &zc);
+				fletcher_4_test(data, data_size, NULL, &zc);
 
 			run_time_ns = gethrtime() - start;
 		} while (run_time_ns < FLETCHER_4_BENCH_NS);
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c
index 1d68ca29e..029075ebe 100644
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@@ -70,6 +70,10 @@ zfs_prop_init(void)
 		{ "fletcher2",	ZIO_CHECKSUM_FLETCHER_2 },
 		{ "fletcher4",	ZIO_CHECKSUM_FLETCHER_4 },
 		{ "sha256",	ZIO_CHECKSUM_SHA256 },
+		{ "noparity",   ZIO_CHECKSUM_NOPARITY },
+		{ "sha512",	ZIO_CHECKSUM_SHA512 },
+		{ "skein",	ZIO_CHECKSUM_SKEIN },
+		{ "edonr",	ZIO_CHECKSUM_EDONR },
 		{ NULL }
 	};
 
@@ -80,6 +84,14 @@ zfs_prop_init(void)
 		{ "sha256",	ZIO_CHECKSUM_SHA256 },
 		{ "sha256,verify",
 				ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY },
+		{ "sha512",	ZIO_CHECKSUM_SHA512 },
+		{ "sha512,verify",
+				ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY },
+		{ "skein",	ZIO_CHECKSUM_SKEIN },
+		{ "skein,verify",
+				ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY },
+		{ "edonr,verify",
+				ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY },
 		{ NULL }
 	};
 
@@ -241,12 +253,12 @@ zfs_prop_init(void)
 	zprop_register_index(ZFS_PROP_CHECKSUM, "checksum",
 	    ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM |
 	    ZFS_TYPE_VOLUME,
-	    "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM",
-	    checksum_table);
+	    "on | off | fletcher2 | fletcher4 | sha256 | sha512 | "
+	    "skein | edonr", "CHECKSUM", checksum_table);
 	zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF,
 	    PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
-	    "on | off | verify | sha256[,verify]", "DEDUP",
-	    dedup_table);
+	    "on | off | verify | sha256[,verify], sha512[,verify], "
+	    "skein[,verify], edonr,verify", "DEDUP", dedup_table);
 	zprop_register_index(ZFS_PROP_COMPRESSION, "compression",
 	    ZIO_COMPRESS_DEFAULT, PROP_INHERIT,
 	    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in
index e1771b233..ce368880c 100644
--- a/module/zfs/Makefile.in
+++ b/module/zfs/Makefile.in
@@ -36,6 +36,7 @@ $(MODULE)-objs += dsl_pool.o
 $(MODULE)-objs += dsl_prop.o
 $(MODULE)-objs += dsl_scan.o
 $(MODULE)-objs += dsl_synctask.o
+$(MODULE)-objs += edonr_zfs.o
 $(MODULE)-objs += fm.o
 $(MODULE)-objs += gzip.o
 $(MODULE)-objs += lzjb.o
@@ -49,6 +50,7 @@ $(MODULE)-objs += refcount.o
 $(MODULE)-objs += rrwlock.o
 $(MODULE)-objs += sa.o
 $(MODULE)-objs += sha256.o
+$(MODULE)-objs += skein_zfs.o
 $(MODULE)-objs += spa.o
 $(MODULE)-objs += spa_boot.o
 $(MODULE)-objs += spa_config.o
diff --git a/module/zfs/arc.c b/module/zfs/arc.c
index bf078aa94..7bae2c42d 100755
--- a/module/zfs/arc.c
+++ b/module/zfs/arc.c
@@ -1382,7 +1382,7 @@ arc_cksum_verify(arc_buf_t *buf)
 		return;
 	}
 
-	fletcher_2_native(buf->b_data, arc_buf_size(buf), &zc);
+	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
 	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
 		panic("buffer modified while frozen!");
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
@@ -1495,7 +1495,7 @@ arc_cksum_compute(arc_buf_t *buf)
 	ASSERT(!ARC_BUF_COMPRESSED(buf));
 	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
 	    KM_SLEEP);
-	fletcher_2_native(buf->b_data, arc_buf_size(buf),
+	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
 	    hdr->b_l1hdr.b_freeze_cksum);
 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 	arc_buf_watch(buf);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index e487e469f..2ec41fb51 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -3814,7 +3814,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
-		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL, NULL,
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 7d2383968..09a3536f5 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -62,7 +62,8 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 	spa_t *spa = ddt->ddt_spa;
 	objset_t *os = ddt->ddt_os;
 	uint64_t *objectp = &ddt->ddt_object[type][class];
-	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+	    ZCHECKSUM_FLAG_DEDUP;
 	char name[DDT_NAMELEN];
 
 	ddt_object_name(ddt, type, class, name);
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index d2f4aac98..80185706c 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -1445,7 +1445,8 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
-			ASSERT(zio_checksum_table[chksum].ci_dedup);
+			ASSERT(zio_checksum_table[chksum].ci_flags &
+			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
@@ -1792,8 +1793,10 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
-		if (zio_checksum_table[checksum].ci_correctable < 1 ||
-		    zio_checksum_table[checksum].ci_eck)
+		if (!(zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_METADATA) ||
+		    (zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 
 		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
@@ -1832,17 +1835,20 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
-			if (!zio_checksum_table[checksum].ci_dedup)
+			if (!(zio_checksum_table[checksum].ci_flags &
+			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
-		 * Enable nopwrite if we have a cryptographically secure
-		 * checksum that has no known collisions (i.e. SHA-256)
-		 * and compression is enabled.  We don't enable nopwrite if
-		 * dedup is enabled as the two features are mutually exclusive.
+		 * Enable nopwrite if we have a secure enough checksum
+		 * algorithm (see comment in zio_nop_write) and
+		 * compression is enabled.  We don't enable nopwrite if
+		 * dedup is enabled as the two features are mutually
+		 * exclusive.
 		 */
-		nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c
index 5e95da52d..f9414ea3a 100644
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -346,7 +346,8 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
 		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
 	} else {
 		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
-		if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
+		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP)
 			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
 		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
 		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index 3026d8733..9362d49bd 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -56,6 +56,7 @@
 #include <sys/dmu_send.h>
 #include <sys/zio_compress.h>
 #include <zfs_fletcher.h>
+#include <sys/zio_checksum.h>
 
 /*
  * The SPA supports block sizes up to 16MB.  However, very large blocks
@@ -108,6 +109,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	int used, compressed, uncompressed;
 	int64_t delta;
+	spa_feature_t f;
 
 	used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
 	compressed = BP_GET_PSIZE(bp);
@@ -134,10 +136,16 @@ dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
 	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
 	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
 	dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
 	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
 		ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
 		    B_TRUE;
 	}
+
+	f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+	if (f != SPA_FEATURE_NONE)
+		ds->ds_feature_activation_needed[f] = B_TRUE;
+
 	mutex_exit(&ds->ds_lock);
 	dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
 	    compressed, uncompressed, tx);
diff --git a/module/zfs/edonr_zfs.c b/module/zfs/edonr_zfs.c
new file mode 100644
index 000000000..3c7d98656
--- /dev/null
+++ b/module/zfs/edonr_zfs.c
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov.  All rights reserved.
+ * Use is subject to license terms.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/edonr.h>
+#include <sys/zfs_context.h>	/* For CTASSERT() */
+
+#define	EDONR_MODE		512
+#define	EDONR_BLOCK_SIZE	EdonR512_BLOCK_SIZE
+
+/*
+ * Native Edon-R checksum of buf, continued from a copy of ctx_template.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_edonr_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	EdonRState	state;
+	uint8_t		hash[EDONR_MODE / 8];
+
+	ASSERT(ctx_template != NULL);
+	bcopy(ctx_template, &state, sizeof (EdonRState));
+	EdonRUpdate(&state, buf, size * 8);
+	EdonRFinal(&state, hash);
+	bcopy(hash, zcp->zc_word, sizeof (zcp->zc_word));
+}
+
+/*
+ * Edon-R checksum of buf with each 64-bit word of the result byteswapped.
+ */
+void
+zio_checksum_edonr_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t	tmp;
+
+	zio_checksum_edonr_native(buf, size, ctx_template, &tmp);
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+
+/*
+ * Allocates an Edon-R context template keyed with the given checksum salt
+ * and returns a pointer to it; zio_checksum_edonr_tmpl_free releases such
+ * a template.
+ */
+void *
+zio_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt)
+{
+	uint8_t		block[EDONR_BLOCK_SIZE];
+	uint8_t		*second_half = block + EDONR_MODE / 8;
+	EdonRState	*tmpl;
+
+	/*
+	 * The salt is shorter than the full-size block that Edon-R needs
+	 * for every hash invocation but the last. Instead of zero-padding,
+	 * stretch it to a whole block: H(salt) || H(H(salt)).
+	 */
+	CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8));
+	EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8,
+	    block);
+	EdonRHash(EDONR_MODE, block, EDONR_MODE, second_half);
+
+	/* Absorb the expanded salt block; it serves as our MAC key. */
+	tmpl = kmem_zalloc(sizeof (*tmpl), KM_SLEEP);
+	EdonRInit(tmpl, EDONR_MODE);
+	EdonRUpdate(tmpl, block, sizeof (block) * 8);
+	return (tmpl);
+}
+
+/* Scrubs and frees an Edon-R context template. */
+void
+zio_checksum_edonr_tmpl_free(void *ctx_template)
+{
+	/* Wipe the state first so it does not linger in freed memory. */
+	bzero(ctx_template, sizeof (EdonRState));
+	kmem_free(ctx_template, sizeof (EdonRState));
+}
diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c
index 57f5b7daf..c8a4882f8 100644
--- a/module/zfs/sha256.c
+++ b/module/zfs/sha256.c
@@ -19,110 +19,64 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
-
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
+#include <sys/sha2.h>
 
-/*
- * SHA-256 checksum, as specified in FIPS 180-3, available at:
- * http://csrc.nist.gov/publications/PubsFIPS.html
- *
- * This is a very compact implementation of SHA-256.
- * It is designed to be simple and portable, not to be fast.
- */
-
-/*
- * The literal definitions of Ch() and Maj() according to FIPS 180-3 are:
- *
- * 	Ch(x, y, z)     (x & y) ^ (~x & z)
- * 	Maj(x, y, z)    (x & y) ^ (x & z) ^ (y & z)
- *
- * We use equivalent logical reductions here that require one less op.
- */
-#define	Ch(x, y, z)	((z) ^ ((x) & ((y) ^ (z))))
-#define	Maj(x, y, z)	(((x) & (y)) ^ ((z) & ((x) ^ (y))))
-#define	Rot32(x, s)	(((x) >> s) | ((x) << (32 - s)))
-#define	SIGMA0(x)	(Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22))
-#define	SIGMA1(x)	(Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25))
-#define	sigma0(x)	(Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3))
-#define	sigma1(x)	(Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10))
-
-static const uint32_t SHA256_K[64] = {
-	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
-	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
-	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
-	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
-	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
-	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
-	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
-	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
-	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-static void
-SHA256Transform(uint32_t *H, const uint8_t *cp)
+/*ARGSUSED*/
+void
+zio_checksum_SHA256(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
-	uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];
-
-	for (t = 0; t < 16; t++, cp += 4)
-		W[t] = ((uint32_t)cp[0] << 24) | ((uint32_t)cp[1] << 16) |
-		    ((uint32_t)cp[2] << 8) | (uint32_t)cp[3];
-
-	for (t = 16; t < 64; t++)
-		W[t] = sigma1(W[t - 2]) + W[t - 7] +
-		    sigma0(W[t - 15]) + W[t - 16];
-
-	a = H[0]; b = H[1]; c = H[2]; d = H[3];
-	e = H[4]; f = H[5]; g = H[6]; h = H[7];
-
-	for (t = 0; t < 64; t++) {
-		T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
-		T2 = SIGMA0(a) + Maj(a, b, c);
-		h = g; g = f; f = e; e = d + T1;
-		d = c; c = b; b = a; a = T1 + T2;
-	}
-
-	H[0] += a; H[1] += b; H[2] += c; H[3] += d;
-	H[4] += e; H[5] += f; H[6] += g; H[7] += h;
+	SHA2_CTX ctx;
+	zio_cksum_t tmp;
+
+	SHA2Init(SHA256, &ctx);
+	SHA2Update(&ctx, buf, size);
+	SHA2Final(&tmp, &ctx);
+
+	/*
+	 * A prior implementation of this function used a private
+	 * SHA256 implementation that always wrote things out in
+	 * Big Endian, and there wasn't a byteswap variant of it.
+	 * To preserve on-disk compatibility we need to force that
+	 * behaviour.
+	 */
+	zcp->zc_word[0] = BE_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BE_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
 }
 
+/*ARGSUSED*/
 void
-zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_SHA512_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
-	uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
-	    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
-	uint8_t pad[128];
-	int i, padsize;
+	SHA2_CTX	ctx;
 
-	for (i = 0; i < (size & ~63ULL); i += 64)
-		SHA256Transform(H, (uint8_t *)buf + i);
-
-	for (padsize = 0; i < size; i++)
-		pad[padsize++] = *((uint8_t *)buf + i);
-
-	for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++)
-		pad[padsize] = 0;
-
-	for (i = 56; i >= 0; i -= 8)
-		pad[padsize++] = (size << 3) >> i;
+	SHA2Init(SHA512_256, &ctx);
+	SHA2Update(&ctx, buf, size);
+	SHA2Final(zcp, &ctx);
+}
 
-	for (i = 0; i < padsize; i += 64)
-		SHA256Transform(H, pad + i);
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t	tmp;
 
-	ZIO_SET_CHECKSUM(zcp,
-	    (uint64_t)H[0] << 32 | H[1],
-	    (uint64_t)H[2] << 32 | H[3],
-	    (uint64_t)H[4] << 32 | H[5],
-	    (uint64_t)H[6] << 32 | H[7]);
+	zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
+	zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+	zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+	zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+	zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
 }
diff --git a/module/zfs/skein_zfs.c b/module/zfs/skein_zfs.c
new file mode 100644
index 000000000..659234039
--- /dev/null
+++ b/module/zfs/skein_zfs.c
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2013 Saso Kiselkov.  All rights reserved.
+ */
+#include <sys/zfs_context.h>
+#include <sys/zio.h>
+#include <sys/skein.h>
+
+/*
+ * Native 256-bit skein MAC checksum of buf, written to *zcp. A non-NULL
+ * ctx_template is mandatory; it should have been allocated using
+ * zio_checksum_skein_tmpl_init.
+ */
+/*ARGSUSED*/
+void
+zio_checksum_skein_native(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	Skein_512_Ctxt_t	work;
+
+	ASSERT(ctx_template != NULL);
+	bcopy(ctx_template, &work, sizeof (Skein_512_Ctxt_t));
+	(void) Skein_512_Update(&work, buf, size);
+	(void) Skein_512_Final(&work, (uint8_t *)zcp);
+	bzero(&work, sizeof (Skein_512_Ctxt_t));
+}
+
+/*
+ * Byteswapped counterpart of zio_checksum_skein_native. Since skein is
+ * internally endian-insensitive, it is enough to run the native checksum
+ * function and byteswap each word of the resulting checksum.
+ */
+void
+zio_checksum_skein_byteswap(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
+{
+	zio_cksum_t	native;
+	int		w;
+
+	zio_checksum_skein_native(buf, size, ctx_template, &native);
+	/* zc_word[0] through zc_word[3] carry the checksum. */
+	for (w = 0; w < 4; w++)
+		zcp->zc_word[w] = BSWAP_64(native.zc_word[w]);
+}
+
+/*
+ * Returns a newly allocated skein MAC context template, initialized from
+ * the given salt, for use in skein MAC checksum computations.
+ */
+void *
+zio_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt)
+{
+	const size_t		cksum_bits = sizeof (zio_cksum_t) * 8;
+	Skein_512_Ctxt_t	*tmpl = kmem_zalloc(sizeof (*tmpl), KM_SLEEP);
+
+	(void) Skein_512_InitExt(tmpl, cksum_bits, 0, salt->zcs_bytes,
+	    sizeof (salt->zcs_bytes));
+	return (tmpl);
+}
+
+/*
+ * Scrubs and releases a skein context template that was previously
+ * allocated using zio_checksum_skein_tmpl_init.
+ */
+void
+zio_checksum_skein_tmpl_free(void *ctx_template)
+{
+	const size_t	tmpl_size = sizeof (Skein_512_Ctxt_t);
+
+	bzero(ctx_template, tmpl_size);
+	kmem_free(ctx_template, tmpl_size);
+}
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 3264bfb10..c2f914e11 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -25,6 +25,7 @@
  * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013, 2014, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  */
 
@@ -2675,6 +2676,19 @@ spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}
 
+	/* Grab the checksum salt from the MOS. */
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT, 1,
+	    sizeof (spa->spa_cksum_salt.zcs_bytes),
+	    spa->spa_cksum_salt.zcs_bytes);
+	if (error == ENOENT) {
+		/* Generate a new salt for subsequent use */
+		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes));
+	} else if (error != 0) {
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
+
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
@@ -3930,6 +3944,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
 		spa_history_create_obj(spa, tx);
 
 	/*
+	 * Generate some random noise for salted checksums to operate on.
+	 */
+	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+	    sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
@@ -6406,6 +6426,20 @@ spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 		if (lz4_en && !lz4_ac)
 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 	}
+
+	/*
+	 * If we haven't written the salt, do so now.  Note that the
+	 * feature may not be activated yet, but that's fine since
+	 * the presence of this ZAP entry is backwards compatible.
+	 */
+	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+		VERIFY0(zap_add(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes),
+		    spa->spa_cksum_salt.zcs_bytes, tx));
+	}
+
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c
index 6330a6a6b..595e594ca 100644
--- a/module/zfs/spa_misc.c
+++ b/module/zfs/spa_misc.c
@@ -23,6 +23,7 @@
  * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
  * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -53,7 +54,7 @@
 #include <sys/ddt.h>
 #include <sys/kstat.h>
 #include "zfs_prop.h"
-#include "zfeature_common.h"
+#include <sys/zfeature.h>
 
 /*
  * SPA locking
@@ -558,6 +559,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 	mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -686,6 +688,8 @@ spa_remove(spa_t *spa)
 	for (t = 0; t < TXG_SIZE; t++)
 		bplist_destroy(&spa->spa_free_bplist[t]);
 
+	zio_checksum_templates_free(spa);
+
 	cv_destroy(&spa->spa_async_cv);
 	cv_destroy(&spa->spa_evicting_os_cv);
 	cv_destroy(&spa->spa_proc_cv);
@@ -699,6 +703,7 @@ spa_remove(spa_t *spa)
 	mutex_destroy(&spa->spa_history_lock);
 	mutex_destroy(&spa->spa_proc_lock);
 	mutex_destroy(&spa->spa_props_lock);
+	mutex_destroy(&spa->spa_cksum_tmpls_lock);
 	mutex_destroy(&spa->spa_scrub_lock);
 	mutex_destroy(&spa->spa_suspend_lock);
 	mutex_destroy(&spa->spa_vdev_top_lock);
diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c
index f5df2c7d8..d1b415367 100644
--- a/module/zfs/vdev_raidz.c
+++ b/module/zfs/vdev_raidz.c
@@ -1604,6 +1604,13 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
 	int c, ret = 0;
 	raidz_col_t *rc;
 
+	blkptr_t *bp = zio->io_bp;
+	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+	if (checksum == ZIO_CHECKSUM_NOPARITY)
+		return (ret);
+
 	for (c = 0; c < rm->rm_firstdatacol; c++) {
 		rc = &rm->rm_col[c];
 		if (!rc->rc_tried || rc->rc_error != 0)
diff --git a/module/zfs/zfeature_common.c b/module/zfs/zfeature_common.c
index 3264f6235..e8b0a16ae 100644
--- a/module/zfs/zfeature_common.c
+++ b/module/zfs/zfeature_common.c
@@ -253,4 +253,16 @@ zpool_feature_init(void)
 	    "Variable on-disk size of dnodes.",
 	    ZFEATURE_FLAG_PER_DATASET, large_dnode_deps);
 	}
+	zfeature_register(SPA_FEATURE_SHA512,
+	    "org.illumos:sha512", "sha512",
+	    "SHA-512/256 hash algorithm.",
+	    ZFEATURE_FLAG_PER_DATASET, NULL);
+	zfeature_register(SPA_FEATURE_SKEIN,
+	    "org.illumos:skein", "skein",
+	    "Skein hash algorithm.",
+	    ZFEATURE_FLAG_PER_DATASET, NULL);
+	zfeature_register(SPA_FEATURE_EDONR,
+	    "org.illumos:edonr", "edonr",
+	    "Edon-R hash algorithm.",
+	    ZFEATURE_FLAG_PER_DATASET, NULL);
 }
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c
index 64f630108..9140c62a6 100644
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -185,6 +185,7 @@
 #include <sys/dsl_bookmark.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
 
 #include <linux/miscdevice.h>
 #include <linux/slab.h>
@@ -3809,11 +3810,6 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
 			return (SET_ERROR(ENOTSUP));
 		break;
 
-	case ZFS_PROP_DEDUP:
-		if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
-			return (SET_ERROR(ENOTSUP));
-		break;
-
 	case ZFS_PROP_VOLBLOCKSIZE:
 	case ZFS_PROP_RECORDSIZE:
 		/* Record sizes above 128k need the feature to be enabled */
@@ -3893,6 +3889,47 @@ zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr)
 				return (SET_ERROR(ENOTSUP));
 		}
 		break;
+	case ZFS_PROP_CHECKSUM:
+	case ZFS_PROP_DEDUP:
+	{
+		spa_feature_t feature;
+		spa_t *spa;
+		uint64_t intval;
+		int err;
+
+		/* dedup feature version checks */
+		if (prop == ZFS_PROP_DEDUP &&
+		    zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+			return (SET_ERROR(ENOTSUP));
+
+		if (nvpair_value_uint64(pair, &intval) != 0)
+			return (SET_ERROR(EINVAL));
+
+		/* check prop value is enabled in features */
+		feature = zio_checksum_to_feature(intval);
+		if (feature == SPA_FEATURE_NONE)
+			break;
+
+		if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+			return (err);
+		/*
+		 * Salted checksums are not supported on root pools.
+		 */
+		if (spa_bootfs(spa) != 0 &&
+		    intval < ZIO_CHECKSUM_FUNCTIONS &&
+		    (zio_checksum_table[intval].ci_flags &
+		    ZCHECKSUM_FLAG_SALTED)) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(ERANGE));
+		}
+		if (!spa_feature_is_enabled(spa, feature)) {
+			spa_close(spa, FTAG);
+			return (SET_ERROR(ENOTSUP));
+		}
+		spa_close(spa, FTAG);
+		break;
+	}
+
 	default:
 		break;
 	}
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index e26822e34..8a063ab7f 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -979,7 +979,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 
 	zio->io_prop.zp_checksum = checksum;
 
-	if (zio_checksum_table[checksum].ci_eck) {
+	if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		/*
 		 * zec checksums are necessarily destructive -- they modify
 		 * the end of the write buffer to hold the verifier/checksum.
@@ -1190,8 +1190,8 @@ zio_write_bp_init(zio_t *zio)
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (ZIO_PIPELINE_CONTINUE);
 
-		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
-		    zp->zp_dedup_verify);
+		ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
 
 		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
 			BP_SET_DEDUP(bp, 1);
@@ -2198,12 +2198,22 @@ zio_write_gang_block(zio_t *pio)
 }
 
 /*
- * The zio_nop_write stage in the pipeline determines if allocating
- * a new bp is necessary.  By leveraging a cryptographically secure checksum,
- * such as SHA256, we can compare the checksums of the new data and the old
- * to determine if allocating a new block is required.  The nopwrite
- * feature can handle writes in either syncing or open context (i.e. zil
- * writes) and as a result is mutually exclusive with dedup.
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary.  The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required.  Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions.  To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
  */
 static int
 zio_nop_write(zio_t *zio)
@@ -2226,7 +2236,8 @@ zio_nop_write(zio_t *zio)
 	 * allocate a new bp.
 	 */
 	if (BP_IS_HOLE(bp_orig) ||
-	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+	    !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+	    ZCHECKSUM_FLAG_NOPWRITE) ||
 	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
 	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
 	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
@@ -2238,7 +2249,8 @@ zio_nop_write(zio_t *zio)
 	 * avoid allocating a new bp and issuing any I/O.
 	 */
 	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
-		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_NOPWRITE);
 		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
 		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
 		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
@@ -2566,7 +2578,8 @@ zio_ddt_write(zio_t *zio)
 		 * we can't resolve it, so just convert to an ordinary write.
 		 * (And automatically e-mail a paper to Nature?)
 		 */
-		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+		if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+		    ZCHECKSUM_FLAG_DEDUP)) {
 			zp->zp_checksum = spa_dedup_checksum(spa);
 			zio_pop_transforms(zio);
 			zio->io_stage = ZIO_STAGE_OPEN;
diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c
index b05e787dc..59871c50e 100644
--- a/module/zfs/zio_checksum.c
+++ b/module/zfs/zio_checksum.c
@@ -21,10 +21,12 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
+#include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/zil.h>
@@ -58,28 +60,96 @@
  * checksum function of the appropriate strength.  When reading a block,
  * we compare the expected checksum against the actual checksum, which we
  * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really).  A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed.  This salt is kept secret (stored on the pool, but
+ * never shown to the user).  Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they won't be able to mount a known
+ * plaintext attack on the DDT, since the actual hash value cannot be
+ * known ahead of time.  How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data.  Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destruct the pre-initialized checksum context.  The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
  */
 
 /*ARGSUSED*/
 static void
-zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_off(const void *buf, uint64_t size,
+    const void *ctx_template, zio_cksum_t *zcp)
 {
 	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 }
 
 zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
-	{{NULL,			NULL},			0, 0, 0, "inherit"},
-	{{NULL,			NULL},			0, 0, 0, "on"},
-	{{zio_checksum_off,	zio_checksum_off},	0, 0, 0, "off"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "label"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 1, 0, "gang_header"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 1, 0, "zilog"},
-	{{fletcher_2_native,	fletcher_2_byteswap},	0, 0, 0, "fletcher2"},
-	{{fletcher_4_native,	fletcher_4_byteswap},	1, 0, 0, "fletcher4"},
-	{{zio_checksum_SHA256,	zio_checksum_SHA256},	1, 0, 1, "sha256"},
-	{{fletcher_4_native,	fletcher_4_byteswap},	0, 1, 0, "zilog2"},
+	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
+	{{NULL, NULL}, NULL, NULL, 0, "on"},
+	{{zio_checksum_off,		zio_checksum_off},
+	    NULL, NULL, 0, "off"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+	    "label"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+	    "gang_header"},
+	{{fletcher_2_native,		fletcher_2_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+	{{fletcher_2_native,		fletcher_2_byteswap},
+	    NULL, NULL, 0, "fletcher2"},
+	{{fletcher_4_native,		fletcher_4_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+	{{zio_checksum_SHA256,		zio_checksum_SHA256},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+	{{fletcher_4_native,		fletcher_4_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+	{{zio_checksum_off,		zio_checksum_off},
+	    NULL, NULL, 0, "noparity"},
+	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
+	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+	{{zio_checksum_skein_native,	zio_checksum_skein_byteswap},
+	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+	{{zio_checksum_edonr_native,	zio_checksum_edonr_byteswap},
+	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
 };
 
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+	/* SHA-512, Skein and Edon-R each map to their own SPA feature. */
+	if (cksum == ZIO_CHECKSUM_SHA512)
+		return (SPA_FEATURE_SHA512);
+	if (cksum == ZIO_CHECKSUM_SKEIN)
+		return (SPA_FEATURE_SKEIN);
+	if (cksum == ZIO_CHECKSUM_EDONR)
+		return (SPA_FEATURE_EDONR);
+
+	/* Every other checksum value maps to no feature. */
+	return (SPA_FEATURE_NONE);
+}
+
 enum zio_checksum
 zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
 {
@@ -113,7 +183,8 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
 	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
 		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
 
-	ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+	    ZCHECKSUM_FLAG_DEDUP) ||
 	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
 
 	return (child);
@@ -146,6 +217,30 @@ zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
 }
 
 /*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+	if (ci->ci_tmpl_init == NULL)		/* no template hook */
+		return;
+	if (spa->spa_cksum_tmpls[checksum] != NULL)	/* already built */
+		return;
+	/* Slow path: take the lock and re-check the slot before filling it. */
+	VERIFY(ci->ci_tmpl_free != NULL);
+	mutex_enter(&spa->spa_cksum_tmpls_lock);
+	if (spa->spa_cksum_tmpls[checksum] == NULL) {
+		spa->spa_cksum_tmpls[checksum] =
+		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
+		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+	}
+	mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
+
+/*
  * Generate the checksum.
  */
 void
@@ -156,11 +251,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 	uint64_t offset = zio->io_offset;
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
 	zio_cksum_t cksum;
+	spa_t *spa = zio->io_spa;
 
 	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
 	ASSERT(ci->ci_func[0] != NULL);
 
-	if (ci->ci_eck) {
+	zio_checksum_template_init(checksum, spa);
+
+	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
 
 		if (checksum == ZIO_CHECKSUM_ZILOG2) {
@@ -179,10 +277,12 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
 		else
 			bp->blk_cksum = eck->zec_cksum;
 		eck->zec_magic = ZEC_MAGIC;
-		ci->ci_func[0](data, size, &cksum);
+		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		    &cksum);
 		eck->zec_cksum = cksum;
 	} else {
-		ci->ci_func[0](data, size, &bp->blk_cksum);
+		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+		    &bp->blk_cksum);
 	}
 }
 
@@ -191,13 +291,15 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
     void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
 {
 	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
-	zio_cksum_t actual_cksum, expected_cksum;
 	int byteswap;
+	zio_cksum_t actual_cksum, expected_cksum;
 
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
 		return (SET_ERROR(EINVAL));
 
-	if (ci->ci_eck) {
+	zio_checksum_template_init(checksum, spa);
+
+	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
 		zio_eck_t *eck;
 		zio_cksum_t verifier;
 
@@ -235,7 +337,8 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
 
 		expected_cksum = eck->zec_cksum;
 		eck->zec_cksum = verifier;
-		ci->ci_func[byteswap](data, size, &actual_cksum);
+		ci->ci_func[byteswap](data, size,
+		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 		eck->zec_cksum = expected_cksum;
 
 		if (byteswap) {
@@ -245,7 +348,8 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
 	} else {
 		byteswap = BP_SHOULD_BYTESWAP(bp);
 		expected_cksum = bp->blk_cksum;
-		ci->ci_func[byteswap](data, size, &actual_cksum);
+		ci->ci_func[byteswap](data, size,
+		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
 	}
 
 	if (info != NULL) {
@@ -286,3 +390,24 @@ zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
 	}
 	return (error);
 }
+
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+	enum zio_checksum c;
+
+	for (c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		zio_checksum_info_t *info = &zio_checksum_table[c];
+
+		if (spa->spa_cksum_tmpls[c] == NULL)
+			continue;	/* nothing was built for this slot */
+		VERIFY(info->ci_tmpl_free != NULL);
+		info->ci_tmpl_free(spa->spa_cksum_tmpls[c]);
+		spa->spa_cksum_tmpls[c] = NULL;
+	}
+}
diff --git a/scripts/zfs2zol-patch.sed b/scripts/zfs2zol-patch.sed
index e6fc5c8b0..d4def4429 100755
--- a/scripts/zfs2zol-patch.sed
+++ b/scripts/zfs2zol-patch.sed
@@ -19,3 +19,22 @@ s:usr/src/test/zfs-tests/runfiles:tests/runfiles:g
 s:usr/src/test/zfs-tests/tests/functional:tests/zfs-tests/tests/functional:g
 s:usr/src/test/zfs-tests/tests/perf:tests/zfs-tests/tests/perf:g
 s:usr/src/test/test-runner/cmd/run.py:tests/test-runner/cmd/test-runner.py:g
+
+#
+# The usr/src/common/zfs/ files go in a couple different dirs.
+# usr/src/common/zfs/zfeature_common.c goes in module/zfs
+#
+s:usr/src/common/zfs/zfeature_common.c:module/zfs/zfeature_common.c:g
+
+# ...but most of the rest of the C files go in module/zcommon
+s/usr\/src\/common\/zfs\/\(.*\)\.c/module\/zcommon\/\1.c/g
+
+# crypto framework
+s:usr/src/common/crypto:module/icp/algs:g
+s:usr/src/uts/common/crypto/io:module/icp/io:g
+
+# Headers
+s:usr/src/common/zfs/\(.*\)\.h:include/\1.h:g
+
+# Man pages
+s:usr/src/man:man:g
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
index 9a85af5d6..92f867ab9 100644
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -60,6 +60,9 @@ tests = ['cache_002_pos', 'cache_003_pos', 'cache_004_neg',
 [tests/functional/casenorm]
 tests = ['case_all_values', 'norm_all_values']
 
+[tests/functional/checksum]
+tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'filetest_001_pos']
+
 [tests/functional/clean_mirror]
 tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos',
     'clean_mirror_003_pos', 'clean_mirror_004_pos']
diff --git a/tests/zfs-tests/cmd/file_write/file_write.c b/tests/zfs-tests/cmd/file_write/file_write.c
index 046794820..81fc5de39 100644
--- a/tests/zfs-tests/cmd/file_write/file_write.c
+++ b/tests/zfs-tests/cmd/file_write/file_write.c
@@ -30,6 +30,9 @@
 #include <inttypes.h>
 #include <sys/types.h>
 #include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <stdint.h>
 
 typedef unsigned char	uchar_t;
 typedef long long	longlong_t;
@@ -44,6 +47,16 @@ static unsigned char bigbuffer[BIGBUFFERSIZE];
 
 static void usage(char *);
 
+/* Pseudo-randomize the buffer: XOR block_size bytes with one rand() byte. */
+static void
+randomize_buffer(int block_size)
+{
+	int i;
+	unsigned char rnd = rand() & 0xff;	/* raw byte, so keep it unsigned */
+	for (i = 0; i < block_size; i++)
+		bigbuffer[i] ^= rnd;
+}
+
 int
 main(int argc, char **argv)
 {
@@ -81,7 +94,10 @@ main(int argc, char **argv)
 				write_count = atoi(optarg);
 				break;
 			case 'd':
-				fillchar = atoi(optarg);
+				if (optarg[0] == 'R')
+					fillchar = 'R'; /* R = random data */
+				else
+					fillchar = atoi(optarg);
 				break;
 			case 's':
 				offset = atoll(optarg);
@@ -138,6 +154,9 @@ main(int argc, char **argv)
 	nxtfillchar = fillchar;
 	k = 0;
 
+	if (fillchar == 'R')
+		srand(time(NULL));
+
 	for (i = 0; i < block_size; i++) {
 		bigbuffer[i] = nxtfillchar;
 
@@ -146,6 +165,8 @@ main(int argc, char **argv)
 				k = 0;
 			}
 			nxtfillchar = k++;
+		} else if (fillchar == 'R') {
+			nxtfillchar = rand() & 0xff;
 		}
 	}
 
@@ -191,14 +212,21 @@ main(int argc, char **argv)
 
 	if (verbose) {
 		(void) printf("%s: block_size = %d, write_count = %d, "
-		    "offset = %lld, data = %s%d\n", filename, block_size,
-		    write_count, offset,
-		    (fillchar == 0) ? "0->" : "",
-		    (fillchar == 0) ? DATA_RANGE : fillchar);
+		    "offset = %lld, ", filename, block_size,
+		    write_count, offset);
+		if (fillchar == 'R') {
+			(void) printf("data = [random]\n");
+		} else {
+			(void) printf("data = %s%d\n",
+			    (fillchar == 0) ? "0->" : "",
+			    (fillchar == 0) ? DATA_RANGE : fillchar);
+		}
 	}
 
 	for (i = 0; i < write_count; i++) {
 		ssize_t n;
+		if (fillchar == 'R')
+			randomize_buffer(block_size);
 
 		if ((n = write(bigfd, &bigbuffer, block_size)) == -1) {
 			(void) printf("write failed (%ld), good_writes = %"
@@ -224,9 +252,11 @@ usage(char *prog)
 {
 	(void) printf("Usage: %s [-v] -o {create,overwrite,append} -f file_name"
 	    " [-b block_size]\n"
-	    "\t[-s offset] [-c write_count] [-d data]\n"
-	    "\twhere [data] equal to zero causes chars "
-	    "0->%d to be repeated throughout\n", prog, DATA_RANGE);
+	    "\t[-s offset] [-c write_count] [-d data]\n\n"
+	    "Where [data] equal to zero causes chars "
+	    "0->%d to be repeated throughout, or [data]\n"
+	    "equal to 'R' for psudorandom data.\n",
+	    prog, DATA_RANGE);
 
 	exit(1);
 }
diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib
index 37f173e12..62ba3a9eb 100644
--- a/tests/zfs-tests/include/libtest.shlib
+++ b/tests/zfs-tests/include/libtest.shlib
@@ -147,12 +147,14 @@ function default_setup_noexit
 	typeset disklist=$1
 	typeset container=$2
 	typeset volume=$3
+	log_note begin default_setup_noexit
 
 	if is_global_zone; then
 		if poolexists $TESTPOOL ; then
 			destroy_pool $TESTPOOL
 		fi
 		[[ -d /$TESTPOOL ]] && $RM -rf /$TESTPOOL
+		log_note creating pool $TESTPOOL $disklist
 		log_must $ZPOOL create -f $TESTPOOL $disklist
 	else
 		reexport_pool
@@ -1539,6 +1541,18 @@ function get_disklist # pool
 	$ECHO $disklist
 }
 
+#
+# Given a pool, list all disks in the pool with their full path (like
+# "/dev/sda" instead of "sda").  $args is expanded unquoted, so
+# get_disklist receives "-P" and the pool name as separate arguments.
+#
+function get_disklist_fullpath # pool
+{
+	typeset args="-P $1"
+	get_disklist $args
+}
+
+
 # /**
 #  This function kills a given list of processes after a time period. We use
 #  this in the stress tests instead of STF_TIMEOUT so that we can have processes
diff --git a/tests/zfs-tests/include/properties.shlib b/tests/zfs-tests/include/properties.shlib
index bb0b4ff58..c495eecb4 100644
--- a/tests/zfs-tests/include/properties.shlib
+++ b/tests/zfs-tests/include/properties.shlib
@@ -16,7 +16,8 @@
 typeset -a compress_props=('on' 'off' 'lzjb' 'gzip' 'gzip-1' 'gzip-2' 'gzip-3'
     'gzip-4' 'gzip-5' 'gzip-6' 'gzip-7' 'gzip-8' 'gzip-9' 'zle')
 
-typeset -a checksum_props=('on' 'off' 'fletcher2' 'fletcher4' 'sha256')
+typeset -a checksum_props=('on' 'off' 'fletcher2' 'fletcher4' 'sha256' 'sha512'
+    'edonr' 'skein' 'noparity')
 
 #
 # Given the property array passed in, return 'num_props' elements to the
diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am
index 79d33a14b..ed01eafb4 100644
--- a/tests/zfs-tests/tests/functional/Makefile.am
+++ b/tests/zfs-tests/tests/functional/Makefile.am
@@ -5,6 +5,7 @@ SUBDIRS = \
 	cache \
 	cachefile \
 	casenorm \
+	checksum \
 	clean_mirror \
 	cli_root \
 	cli_user \
diff --git a/tests/zfs-tests/tests/functional/checksum/.gitignore b/tests/zfs-tests/tests/functional/checksum/.gitignore
new file mode 100644
index 000000000..0411d5aa4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/.gitignore
@@ -0,0 +1,4 @@
+skein_test
+edonr_test
+sha2_test
+
diff --git a/tests/zfs-tests/tests/functional/checksum/Makefile.am b/tests/zfs-tests/tests/functional/checksum/Makefile.am
new file mode 100644
index 000000000..2d7d271a0
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/Makefile.am
@@ -0,0 +1,26 @@
+include $(top_srcdir)/config/Rules.am
+AM_CPPFLAGS += -I$(top_srcdir)/include
+LDADD = $(top_builddir)/lib/libicp/libicp.la
+
+AUTOMAKE_OPTIONS = subdir-objects
+
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/checksum
+
+dist_pkgdata_SCRIPTS = \
+	setup.ksh \
+	cleanup.ksh \
+	run_edonr_test.ksh \
+	run_sha2_test.ksh \
+	run_skein_test.ksh \
+	filetest_001_pos.ksh
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/checksum
+
+pkgexec_PROGRAMS = \
+	edonr_test \
+	skein_test \
+	sha2_test
+
+edonr_test_SOURCES = edonr_test.c
+skein_test_SOURCES = skein_test.c
+sha2_test_SOURCES = sha2_test.c
diff --git a/tests/zfs-tests/tests/functional/checksum/cleanup.ksh b/tests/zfs-tests/tests/functional/checksum/cleanup.ksh
new file mode 100755
index 000000000..79cd6e9f9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/cleanup.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+# All teardown is delegated to default_cleanup (not defined in this script).
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/checksum/edonr_test.c b/tests/zfs-tests/tests/functional/checksum/edonr_test.c
new file mode 100644
index 000000000..1ea8e991e
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/edonr_test.c
@@ -0,0 +1,219 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * This is just to keep the compiler happy about sys/time.h not declaring
+ * gettimeofday due to -D_KERNEL (we can do this since we're actually
+ * running in userspace, but we need -D_KERNEL for the remaining Edon-R code).
+ */
+#ifdef	_KERNEL
+#undef	_KERNEL
+#endif
+
+#include <sys/edonr.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/time.h>
+#define NOTE(x) 
+typedef enum boolean { B_FALSE, B_TRUE } boolean_t;
+typedef	unsigned long long	u_longlong_t;
+
+/*
+ * Test messages from:
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/SHA_All.pdf
+ */
+const char	*test_msg0 = "abc";
+const char	*test_msg1 = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmn"
+	"lmnomnopnopq";
+const char	*test_msg2 = "abcdefghbcdefghicdefghijdefghijkefghijklfghi"
+	"jklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu";
+
+/*
+ * Test digests computed by hand. There's no formal standard or spec for edonr.
+ */
+const uint8_t	edonr_224_test_digests[][28] = {
+	{
+		/* for test_msg0 */
+		0x56, 0x63, 0xc4, 0x93, 0x95, 0x20, 0xfa, 0xf6,
+		0x12, 0x31, 0x65, 0xa4, 0x66, 0xf2, 0x56, 0x01,
+		0x95, 0x2e, 0xa9, 0xe4, 0x24, 0xdd, 0xc9, 0x6b,
+		0xef, 0xd0, 0x40, 0x94
+	},
+	{
+		/* for test_msg1 */
+		0xd0, 0x13, 0xe4, 0x87, 0x4d, 0x06, 0x8d, 0xca,
+		0x4e, 0x14, 0xb9, 0x37, 0x2f, 0xce, 0x12, 0x20,
+		0x60, 0xf8, 0x5c, 0x0a, 0xfd, 0x7a, 0x7d, 0x97,
+		0x88, 0x2b, 0x05, 0x75
+	}
+	/* no test vector for test_msg2 */
+};
+
+const uint8_t	edonr_256_test_digests[][32] = {
+	{
+		/* for test_msg0 */
+		0x54, 0xd7, 0x8b, 0x13, 0xc7, 0x4e, 0xda, 0x5a,
+		0xed, 0xc2, 0x71, 0xcc, 0x88, 0x1f, 0xb2, 0x2f,
+		0x83, 0x99, 0xaf, 0xd3, 0x04, 0x0b, 0x6a, 0x39,
+		0x2d, 0x73, 0x94, 0x05, 0x50, 0x8d, 0xd8, 0x51
+	},
+	{
+		/* for test_msg1 */
+		0x49, 0x2d, 0x0b, 0x19, 0xab, 0x1e, 0xde, 0x3a,
+		0xea, 0x9b, 0xf2, 0x39, 0x3a, 0xb1, 0x21, 0xde,
+		0x21, 0xf6, 0x80, 0x1f, 0xad, 0xbe, 0x8b, 0x07,
+		0xc7, 0xfb, 0xe6, 0x99, 0x0e, 0x4d, 0x73, 0x63
+	}
+	/* no test vector for test_msg2 */
+};
+
+const uint8_t	edonr_384_test_digests[][48] = {
+	{
+		/* for test_msg0 */
+		0x0e, 0x7c, 0xd7, 0x85, 0x78, 0x77, 0xe0, 0x89,
+		0x5b, 0x1c, 0xdf, 0x49, 0xf4, 0x1d, 0x20, 0x9c,
+		0x72, 0x7d, 0x2e, 0x57, 0x9b, 0x9b, 0x9a, 0xdc,
+		0x60, 0x27, 0x97, 0x82, 0xb9, 0x90, 0x72, 0xec,
+		0x7e, 0xce, 0xd3, 0x16, 0x5f, 0x47, 0x75, 0x48,
+		0xfa, 0x60, 0x72, 0x7e, 0x01, 0xc7, 0x7c, 0xc6
+	},
+	{
+		/* no test vector for test_msg1 */
+		0
+	},
+	{
+		/* for test_msg2 */
+		0xe2, 0x34, 0xa1, 0x02, 0x83, 0x76, 0xae, 0xe6,
+		0x82, 0xd9, 0x38, 0x32, 0x0e, 0x00, 0x78, 0xd2,
+		0x34, 0xdb, 0xb9, 0xbd, 0xf0, 0x08, 0xa8, 0x0f,
+		0x63, 0x1c, 0x3d, 0x4a, 0xfd, 0x0a, 0xe9, 0x59,
+		0xdc, 0xd4, 0xce, 0xcd, 0x8d, 0x67, 0x6c, 0xea,
+		0xbb, 0x1a, 0x32, 0xed, 0x5c, 0x6b, 0xf1, 0x7f
+	}
+};
+
+const uint8_t	edonr_512_test_digests[][64] = {
+	{
+		/* for test_msg0 */
+		0x1b, 0x14, 0xdb, 0x15, 0x5f, 0x1d, 0x40, 0x65,
+		0x94, 0xb8, 0xce, 0xf7, 0x0a, 0x43, 0x62, 0xec,
+		0x6b, 0x5d, 0xe6, 0xa5, 0xda, 0xf5, 0x0e, 0xc9,
+		0x99, 0xe9, 0x87, 0xc1, 0x9d, 0x30, 0x49, 0xe2,
+		0xde, 0x59, 0x77, 0xbb, 0x05, 0xb1, 0xbb, 0x22,
+		0x00, 0x50, 0xa1, 0xea, 0x5b, 0x46, 0xa9, 0xf1,
+		0x74, 0x0a, 0xca, 0xfb, 0xf6, 0xb4, 0x50, 0x32,
+		0xad, 0xc9, 0x0c, 0x62, 0x83, 0x72, 0xc2, 0x2b
+	},
+	{
+		/* no test vector for test_msg1 */
+		0
+	},
+	{
+		/* for test_msg2 */
+		0x53, 0x51, 0x07, 0x0d, 0xc5, 0x1c, 0x3b, 0x2b,
+		0xac, 0xa5, 0xa6, 0x0d, 0x02, 0x52, 0xcc, 0xb4,
+		0xe4, 0x92, 0x1a, 0x96, 0xfe, 0x5a, 0x69, 0xe7,
+		0x6d, 0xad, 0x48, 0xfd, 0x21, 0xa0, 0x84, 0x5a,
+		0xd5, 0x7f, 0x88, 0x0b, 0x3e, 0x4a, 0x90, 0x7b,
+		0xc5, 0x03, 0x15, 0x18, 0x42, 0xbb, 0x94, 0x9e,
+		0x1c, 0xba, 0x74, 0x39, 0xa6, 0x40, 0x9a, 0x34,
+		0xb8, 0x43, 0x6c, 0xb4, 0x69, 0x21, 0x58, 0x3c
+	}
+};
+
+int
+main(int argc, char *argv[])
+{
+	boolean_t	failed = B_FALSE;
+	uint64_t	cpu_mhz = 0;	/* argv[1]; 0 means not given */
+
+	if (argc == 2)
+		cpu_mhz = atoi(argv[1]);
+	/* Hash _m in the given mode (digest bits); a mismatch sets failed. */
+#define	EDONR_ALGO_TEST(_m, mode, testdigest)				\
+	do {								\
+		EdonRState	ctx;					\
+		uint8_t		digest[mode / 8];			\
+		EdonRInit(&ctx, mode);					\
+		EdonRUpdate(&ctx, (const uint8_t *) _m, strlen(_m) * 8);\
+		EdonRFinal(&ctx, digest);				\
+		(void) printf("Edon-R-%-6sMessage: " #_m		\
+		    "\tResult: ", #mode);				\
+		if (bcmp(digest, testdigest, mode / 8) == 0) {		\
+			(void) printf("OK\n");				\
+		} else {						\
+			(void) printf("FAILED!\n");			\
+			failed = B_TRUE;				\
+		}							\
+		NOTE(CONSTCOND)						\
+	} while (0)
+	/* Time 8192 updates of a zeroed 128 KiB block; print us and CPB. */
+#define	EDONR_PERF_TEST(mode)						\
+	do {								\
+		EdonRState	ctx;					\
+		uint8_t		digest[mode / 8];			\
+		uint8_t		block[131072];				\
+		uint64_t	delta;					\
+		double		cpb = 0;				\
+		int		i;					\
+		struct timeval	start, end;				\
+		bzero(block, sizeof (block));				\
+		(void) gettimeofday(&start, NULL);			\
+		EdonRInit(&ctx, mode);					\
+		for (i = 0; i < 8192; i++)				\
+			EdonRUpdate(&ctx, block, sizeof (block) * 8);	\
+		EdonRFinal(&ctx, digest);				\
+		(void) gettimeofday(&end, NULL);			\
+		delta = (end.tv_sec * 1000000llu + end.tv_usec) -	\
+		    (start.tv_sec * 1000000llu + start.tv_usec);	\
+		if (cpu_mhz != 0) {					\
+			cpb = (cpu_mhz * 1e6 * ((double)delta /		\
+			    1000000)) / (8192 * 128 * 1024);		\
+		}							\
+		(void) printf("Edon-R-%-6s%llu us (%.02f CPB)\n", #mode,\
+		    (u_longlong_t)delta, cpb);				\
+		NOTE(CONSTCOND)						\
+	} while (0)
+
+	(void) printf("Running algorithm correctness tests:\n");
+	EDONR_ALGO_TEST(test_msg0, 224, edonr_224_test_digests[0]);
+	EDONR_ALGO_TEST(test_msg1, 224, edonr_224_test_digests[1]);
+	EDONR_ALGO_TEST(test_msg0, 256, edonr_256_test_digests[0]);
+	EDONR_ALGO_TEST(test_msg1, 256, edonr_256_test_digests[1]);
+	EDONR_ALGO_TEST(test_msg0, 384, edonr_384_test_digests[0]);
+	EDONR_ALGO_TEST(test_msg2, 384, edonr_384_test_digests[2]);
+	EDONR_ALGO_TEST(test_msg0, 512, edonr_512_test_digests[0]);
+	EDONR_ALGO_TEST(test_msg2, 512, edonr_512_test_digests[2]);
+	if (failed)
+		return (1);
+
+	(void) printf("Running performance tests (hashing 1024 MiB of "
+	    "data):\n");
+	EDONR_PERF_TEST(256);
+	EDONR_PERF_TEST(512);
+
+	return (0);
+}
diff --git a/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh b/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh
new file mode 100755
index 000000000..758b353c9
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh
@@ -0,0 +1,125 @@
+#! /bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/include/properties.shlib
+
+# DESCRIPTION:
+# Sanity test to make sure checksum algorithms work.
+# For each checksum, create a file in the pool using that checksum.  Verify
+# that there are no checksum errors.  Next, for each checksum, create a single
+# file in the pool using that checksum, scramble the underlying vdev, and
+# verify that we correctly catch the checksum errors.
+#
+# STRATEGY:
+# Test 1
+# 1. Create a mirrored pool
+# 2. Create a file using each checksum
+# 3. Export/import/scrub the pool
+# 4. Verify there's no checksum errors.
+# 5. Clear the pool
+#
+# Test 2
+# 6. For each checksum:
+# 7.	Create a file using the checksum
+# 8.	Export the pool
+# 9.	Scramble the data on one of the underlying VDEVs
+# 10.	Import the pool
+# 11.	Scrub the pool
+# 12.	Verify that there are checksum errors
+
+verify_runnable "both"
+
+function cleanup
+{
+	$ECHO cleanup
+	# Empty $TESTDIR, but only when it exists.
+	[[ -e $TESTDIR ]] && log_must $RM -rf $TESTDIR/* > /dev/null 2>&1
+}
+
+log_assert "Create and read back files with using different checksum algorithms"
+
+log_onexit cleanup
+
+FSSIZE=$($ZPOOL list -Hp -o size $TESTPOOL)
+WRITESZ=1048576
+WRITECNT=$((($FSSIZE) / $WRITESZ ))
+# Skip roughly the first 4MB, and shorten the write to stay clear of the end
+SKIP=4127518
+SKIPCNT=$((($SKIP / $WRITESZ )))
+SKIPCNT=$((($SKIPCNT * 2)))
+WRITECNT=$((($WRITECNT - $SKIPCNT)))
+
+# Get a list of vdevs in our pool (get_disklist_fullpath reads the pool as $1)
+set -A array $(get_disklist_fullpath $TESTPOOL)
+
+# Get the first vdev, since we will corrupt it later
+firstvdev=${array[0]}
+
+# First test each checksum by writing a file using it, and confirm there are
+# no errors.
+for ((count = 0; count < ${#checksum_props[*]} ; count++)); do
+	i=${checksum_props[$count]}
+	$ZFS set checksum=$i $TESTPOOL
+	$FILE_WRITE -o overwrite -f $TESTDIR/test_$i -b $WRITESZ -c 5 -d R
+done
+$ZPOOL export $TESTPOOL
+$ZPOOL import $TESTPOOL
+$ZPOOL scrub $TESTPOOL
+while is_pool_scrubbing $TESTPOOL; do
+	$SLEEP 1
+done
+$ZPOOL status -P -v $TESTPOOL | grep $firstvdev | read -r name state rd wr cksum
+log_note "Normal file write test saw: $cksum errors"
+log_must [ $cksum -eq 0 ]
+
+$RM -fr $TESTDIR/*
+
+log_note "Test scrambling the disk and seeing checksum errors"
+for ((count = 0; count < ${#checksum_props[*]} ; count++)); do
+	i=${checksum_props[$count]}
+	$ZFS set checksum=$i $TESTPOOL
+	$FILE_WRITE -o overwrite -f $TESTDIR/test_$i -b $WRITESZ -c 5 -d R
+
+	$ZPOOL export $TESTPOOL
+
+	# Scramble the first vdev: start $SKIP bytes in and write only $WRITECNT
+	# blocks, which is meant to leave both ends of the device untouched.
+	$FILE_WRITE -o overwrite -f $firstvdev -s $SKIP -c $WRITECNT -b $WRITESZ -d R
+
+	$ZPOOL import $TESTPOOL
+
+	$ZPOOL scrub $TESTPOOL
+	while is_pool_scrubbing $TESTPOOL; do
+		$SLEEP 1
+	done
+
+	$ZPOOL status -P -v $TESTPOOL | grep $firstvdev | read -r name state rd wr cksum
+
+	log_note "Checksum '$i' caught $cksum checksum errors"
+	log_must [ $cksum -ne 0 ]
+
+	$RM -f $TESTDIR/test_$i
+	$ZPOOL clear $TESTPOOL
+done
+
+log_pass "Scrub reported checksum errors after scrambling for every checksum setting"
diff --git a/tests/zfs-tests/tests/functional/checksum/run_edonr_test.ksh b/tests/zfs-tests/tests/functional/checksum/run_edonr_test.ksh
new file mode 100755
index 000000000..7bcb321f2
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/run_edonr_test.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Invoke the edonr_test program, passing it the get_cpu_freq result.
+#
+
+log_assert "Run the tests for the EdonR hash algorithm."
+
+typeset cpu_freq=$(get_cpu_freq)
+log_must $STF_SUITE/tests/functional/checksum/edonr_test $cpu_freq
+
+log_pass "EdonR tests passed."
diff --git a/tests/zfs-tests/tests/functional/checksum/run_sha2_test.ksh b/tests/zfs-tests/tests/functional/checksum/run_sha2_test.ksh
new file mode 100755
index 000000000..589e28a7f
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/run_sha2_test.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Invoke the sha2_test program, passing it the get_cpu_freq result.
+#
+
+log_assert "Run the tests for the SHA-2 hash algorithm."
+
+typeset cpu_freq=$(get_cpu_freq)
+log_must $STF_SUITE/tests/functional/checksum/sha2_test $cpu_freq
+
+log_pass "SHA-2 tests passed."
diff --git a/tests/zfs-tests/tests/functional/checksum/run_skein_test.ksh b/tests/zfs-tests/tests/functional/checksum/run_skein_test.ksh
new file mode 100755
index 000000000..4290bfc79
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/run_skein_test.ksh
@@ -0,0 +1,30 @@
+#!/bin/ksh -p
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2015 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Invoke the skein_test program, passing it the get_cpu_freq result.
+#
+
+log_assert "Run the tests for the Skein hash algorithm."
+
+typeset cpu_freq=$(get_cpu_freq)
+log_must $STF_SUITE/tests/functional/checksum/skein_test $cpu_freq
+
+log_pass "Skein tests passed."
diff --git a/tests/zfs-tests/tests/functional/checksum/setup.ksh b/tests/zfs-tests/tests/functional/checksum/setup.ksh
new file mode 100755
index 000000000..27e125df4
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/setup.ksh
@@ -0,0 +1,31 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+# DISK = first word of $DISKS; the full list goes to default_mirror_setup.
+DISK=${DISKS%% *}
+default_mirror_setup $DISKS
diff --git a/tests/zfs-tests/tests/functional/checksum/sha2_test.c b/tests/zfs-tests/tests/functional/checksum/sha2_test.c
new file mode 100644
index 000000000..afd6f8243
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/sha2_test.c
@@ -0,0 +1,265 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * This is just to keep the compiler happy about sys/time.h not declaring
+ * gettimeofday due to -D_KERNEL (we can do this since we're actually
+ * running in userspace, but we need -D_KERNEL for the remaining SHA2 code).
+ */
+#ifdef	_KERNEL
+#undef	_KERNEL
+#endif
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/time.h>
+#define	_SHA2_IMPL
+#include <sys/sha2.h>
+#define NOTE(x)
+typedef enum boolean { B_FALSE, B_TRUE } boolean_t;
+typedef	unsigned long long	u_longlong_t;
+
+
+/*
+ * Test messages from:
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/SHA_All.pdf
+ */
+
+const char	*test_msg0 = "abc";
+const char	*test_msg1 = "abcdbcdecdefdefgefghfghighijhijkijkljklmklmn"
+	"lmnomnopnopq";
+const char	*test_msg2 = "abcdefghbcdefghicdefghijdefghijkefghijklfghi"
+	"jklmghijklmnhijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu";
+
+/*
+ * Test digests from:
+ * http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/SHA_All.pdf
+ */
+const uint8_t	sha256_test_digests[][32] = {
+	{
+		/* [0]: expected SHA-256 digest of test_msg0 */
+		0xBA, 0x78, 0x16, 0xBF, 0x8F, 0x01, 0xCF, 0xEA,
+		0x41, 0x41, 0x40, 0xDE, 0x5D, 0xAE, 0x22, 0x23,
+		0xB0, 0x03, 0x61, 0xA3, 0x96, 0x17, 0x7A, 0x9C,
+		0xB4, 0x10, 0xFF, 0x61, 0xF2, 0x00, 0x15, 0xAD
+	},
+	{
+		/* [1]: expected SHA-256 digest of test_msg1 */
+		0x24, 0x8D, 0x6A, 0x61, 0xD2, 0x06, 0x38, 0xB8,
+		0xE5, 0xC0, 0x26, 0x93, 0x0C, 0x3E, 0x60, 0x39,
+		0xA3, 0x3C, 0xE4, 0x59, 0x64, 0xFF, 0x21, 0x67,
+		0xF6, 0xEC, 0xED, 0xD4, 0x19, 0xDB, 0x06, 0xC1
+	}
+	/* no test vector for test_msg2, so the table has no index 2 */
+};
+
+const uint8_t	sha384_test_digests[][48] = {
+	{
+		/* [0]: expected SHA-384 digest of test_msg0 */
+		0xCB, 0x00, 0x75, 0x3F, 0x45, 0xA3, 0x5E, 0x8B,
+		0xB5, 0xA0, 0x3D, 0x69, 0x9A, 0xC6, 0x50, 0x07,
+		0x27, 0x2C, 0x32, 0xAB, 0x0E, 0xDE, 0xD1, 0x63,
+		0x1A, 0x8B, 0x60, 0x5A, 0x43, 0xFF, 0x5B, 0xED,
+		0x80, 0x86, 0x07, 0x2B, 0xA1, 0xE7, 0xCC, 0x23,
+		0x58, 0xBA, 0xEC, 0xA1, 0x34, 0xC8, 0x25, 0xA7
+	},
+	{
+		/* no vector for test_msg1: filler so index == message no. */
+		0
+	},
+	{
+		/* [2]: expected SHA-384 digest of test_msg2 */
+		0x09, 0x33, 0x0C, 0x33, 0xF7, 0x11, 0x47, 0xE8,
+		0x3D, 0x19, 0x2F, 0xC7, 0x82, 0xCD, 0x1B, 0x47,
+		0x53, 0x11, 0x1B, 0x17, 0x3B, 0x3B, 0x05, 0xD2,
+		0x2F, 0xA0, 0x80, 0x86, 0xE3, 0xB0, 0xF7, 0x12,
+		0xFC, 0xC7, 0xC7, 0x1A, 0x55, 0x7E, 0x2D, 0xB9,
+		0x66, 0xC3, 0xE9, 0xFA, 0x91, 0x74, 0x60, 0x39
+	}
+};
+
+const uint8_t	sha512_test_digests[][64] = {
+	{
+		/* [0]: expected SHA-512 digest of test_msg0 */
+		0xDD, 0xAF, 0x35, 0xA1, 0x93, 0x61, 0x7A, 0xBA,
+		0xCC, 0x41, 0x73, 0x49, 0xAE, 0x20, 0x41, 0x31,
+		0x12, 0xE6, 0xFA, 0x4E, 0x89, 0xA9, 0x7E, 0xA2,
+		0x0A, 0x9E, 0xEE, 0xE6, 0x4B, 0x55, 0xD3, 0x9A,
+		0x21, 0x92, 0x99, 0x2A, 0x27, 0x4F, 0xC1, 0xA8,
+		0x36, 0xBA, 0x3C, 0x23, 0xA3, 0xFE, 0xEB, 0xBD,
+		0x45, 0x4D, 0x44, 0x23, 0x64, 0x3C, 0xE8, 0x0E,
+		0x2A, 0x9A, 0xC9, 0x4F, 0xA5, 0x4C, 0xA4, 0x9F
+	},
+	{
+		/* no vector for test_msg1: filler so index == message no. */
+		0
+	},
+	{
+		/* [2]: expected SHA-512 digest of test_msg2 */
+		0x8E, 0x95, 0x9B, 0x75, 0xDA, 0xE3, 0x13, 0xDA,
+		0x8C, 0xF4, 0xF7, 0x28, 0x14, 0xFC, 0x14, 0x3F,
+		0x8F, 0x77, 0x79, 0xC6, 0xEB, 0x9F, 0x7F, 0xA1,
+		0x72, 0x99, 0xAE, 0xAD, 0xB6, 0x88, 0x90, 0x18,
+		0x50, 0x1D, 0x28, 0x9E, 0x49, 0x00, 0xF7, 0xE4,
+		0x33, 0x1B, 0x99, 0xDE, 0xC4, 0xB5, 0x43, 0x3A,
+		0xC7, 0xD3, 0x29, 0xEE, 0xB6, 0xDD, 0x26, 0x54,
+		0x5E, 0x96, 0xE5, 0x5B, 0x87, 0x4B, 0xE9, 0x09
+	}
+};
+
+const uint8_t	sha512_224_test_digests[][28] = {
+	{
+		/* [0]: expected SHA-512/224 digest of test_msg0 */
+		0x46, 0x34, 0x27, 0x0F, 0x70, 0x7B, 0x6A, 0x54,
+		0xDA, 0xAE, 0x75, 0x30, 0x46, 0x08, 0x42, 0xE2,
+		0x0E, 0x37, 0xED, 0x26, 0x5C, 0xEE, 0xE9, 0xA4,
+		0x3E, 0x89, 0x24, 0xAA
+	},
+	{
+		/* no vector for test_msg1: filler so index == message no. */
+		0
+	},
+	{
+		/* [2]: expected SHA-512/224 digest of test_msg2 */
+		0x23, 0xFE, 0xC5, 0xBB, 0x94, 0xD6, 0x0B, 0x23,
+		0x30, 0x81, 0x92, 0x64, 0x0B, 0x0C, 0x45, 0x33,
+		0x35, 0xD6, 0x64, 0x73, 0x4F, 0xE4, 0x0E, 0x72,
+		0x68, 0x67, 0x4A, 0xF9
+	}
+};
+
+const uint8_t	sha512_256_test_digests[][32] = {
+	{
+		/* [0]: expected SHA-512/256 digest of test_msg0 */
+		0x53, 0x04, 0x8E, 0x26, 0x81, 0x94, 0x1E, 0xF9,
+		0x9B, 0x2E, 0x29, 0xB7, 0x6B, 0x4C, 0x7D, 0xAB,
+		0xE4, 0xC2, 0xD0, 0xC6, 0x34, 0xFC, 0x6D, 0x46,
+		0xE0, 0xE2, 0xF1, 0x31, 0x07, 0xE7, 0xAF, 0x23
+	},
+	{
+		/* no vector for test_msg1: filler so index == message no. */
+		0
+	},
+	{
+		/* [2]: expected SHA-512/256 digest of test_msg2 */
+		0x39, 0x28, 0xE1, 0x84, 0xFB, 0x86, 0x90, 0xF8,
+		0x40, 0xDA, 0x39, 0x88, 0x12, 0x1D, 0x31, 0xBE,
+		0x65, 0xCB, 0x9D, 0x3E, 0xF8, 0x3E, 0xE6, 0x14,
+		0x6F, 0xEA, 0xC8, 0x61, 0xE1, 0x9B, 0x56, 0x3A
+	}
+};
+
+/*
+ * Userland stand-in for cmn_err() (called by sha2.c): prints to stderr.
+ */
+/*ARGSUSED*/
+void
+cmn_err(int level, char *format, ...)
+{
+	va_list args;
+	va_start(args, format);
+	/* LINTED: E_SEC_PRINTF_VAR_FMT */
+	(void) vfprintf(stderr, format, args);
+	va_end(args);
+}
+
+int
+main(int argc, char *argv[])
+{
+	boolean_t	failed = B_FALSE;
+	uint64_t	cpu_mhz = 0;	/* from argv[1]; 0 leaves CPB at 0 */
+
+	if (argc == 2)
+		cpu_mhz = atoi(argv[1]);
+	/* Hash _m in the given SHA-2 mode; compare with the expected digest. */
+#define	SHA2_ALGO_TEST(_m, mode, diglen, testdigest)			\
+	do {								\
+		SHA2_CTX		ctx;				\
+		uint8_t			digest[diglen / 8];		\
+		SHA2Init(SHA ## mode ## _MECH_INFO_TYPE, &ctx);		\
+		SHA2Update(&ctx, _m, strlen(_m));			\
+		SHA2Final(digest, &ctx);				\
+		(void) printf("SHA%-9sMessage: " #_m			\
+		    "\tResult: ", #mode);				\
+		if (memcmp(digest, testdigest, diglen / 8) == 0) {	\
+			(void) printf("OK\n");				\
+		} else {						\
+			(void) printf("FAILED!\n");			\
+			failed = B_TRUE;				\
+		}							\
+		NOTE(CONSTCOND)						\
+	} while (0)
+	/* Time 8192 updates over a zeroed 128 KiB block; print us and CPB. */
+#define	SHA2_PERF_TEST(mode, diglen)					\
+	do {								\
+		SHA2_CTX	ctx;					\
+		uint8_t		digest[diglen / 8];			\
+		uint8_t		block[131072];				\
+		uint64_t	delta;					\
+		double		cpb = 0;				\
+		int		i;					\
+		struct timeval	start, end;				\
+		(void) memset(block, 0, sizeof (block));		\
+		(void) gettimeofday(&start, NULL);			\
+		SHA2Init(SHA ## mode ## _MECH_INFO_TYPE, &ctx);		\
+		for (i = 0; i < 8192; i++)				\
+			SHA2Update(&ctx, block, sizeof (block));	\
+		SHA2Final(digest, &ctx);				\
+		(void) gettimeofday(&end, NULL);			\
+		delta = (end.tv_sec * 1000000llu + end.tv_usec) -	\
+		    (start.tv_sec * 1000000llu + start.tv_usec);	\
+		if (cpu_mhz != 0) {					\
+			cpb = (cpu_mhz * 1e6 * ((double)delta /		\
+			    1000000)) / (8192 * 128 * 1024);		\
+		}							\
+		(void) printf("SHA%-9s%llu us (%.02f CPB)\n", #mode,	\
+		    (u_longlong_t)delta, cpb);				\
+		NOTE(CONSTCOND)						\
+	} while (0)
+
+	(void) printf("Running algorithm correctness tests:\n");
+	SHA2_ALGO_TEST(test_msg0, 256, 256, sha256_test_digests[0]);
+	SHA2_ALGO_TEST(test_msg1, 256, 256, sha256_test_digests[1]);
+	SHA2_ALGO_TEST(test_msg0, 384, 384, sha384_test_digests[0]);
+	SHA2_ALGO_TEST(test_msg2, 384, 384, sha384_test_digests[2]);
+	SHA2_ALGO_TEST(test_msg0, 512, 512, sha512_test_digests[0]);
+	SHA2_ALGO_TEST(test_msg2, 512, 512, sha512_test_digests[2]);
+	SHA2_ALGO_TEST(test_msg0, 512_224, 224, sha512_224_test_digests[0]);
+	SHA2_ALGO_TEST(test_msg2, 512_224, 224, sha512_224_test_digests[2]);
+	SHA2_ALGO_TEST(test_msg0, 512_256, 256, sha512_256_test_digests[0]);
+	SHA2_ALGO_TEST(test_msg2, 512_256, 256, sha512_256_test_digests[2]);
+
+	if (failed)
+		return (1);
+
+	(void) printf("Running performance tests (hashing 1024 MiB of "
+	    "data):\n");
+	SHA2_PERF_TEST(256, 256);
+	SHA2_PERF_TEST(512, 512);
+
+	return (0);
+}
diff --git a/tests/zfs-tests/tests/functional/checksum/skein_test.c b/tests/zfs-tests/tests/functional/checksum/skein_test.c
new file mode 100644
index 000000000..37548f03b
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/checksum/skein_test.c
@@ -0,0 +1,342 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
+
+/*
+ * This is just to keep the compiler happy about sys/time.h not declaring
+ * gettimeofday due to -D_KERNEL (we can do this since we're actually
+ * running in userspace, but we need -D_KERNEL for the remaining Skein code).
+ */
+#ifdef	_KERNEL
+#undef	_KERNEL
+#endif
+
+#include <sys/skein.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <stdio.h>
+#include <sys/time.h>
+#define NOTE(x)
+
+typedef	enum boolean { B_FALSE, B_TRUE } boolean_t;
+typedef	unsigned long long	u_longlong_t;
+
+/*
+ * Skein test suite using values from the Skein V1.3 specification found at:
+ * http://www.skein-hash.info/sites/default/files/skein1.3.pdf
+ */
+
+/*
+ * Test messages from the Skein spec, Appendix C.
+ */
+const uint8_t	test_msg0[] = {
+	0xFF
+};
+
+const uint8_t	test_msg1[] = {
+	0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA, 0xF9, 0xF8,
+	0xF7, 0xF6, 0xF5, 0xF4, 0xF3, 0xF2, 0xF1, 0xF0,
+	0xEF, 0xEE, 0xED, 0xEC, 0xEB, 0xEA, 0xE9, 0xE8,
+	0xE7, 0xE6, 0xE5, 0xE4, 0xE3, 0xE2, 0xE1, 0xE0
+};
+
+const uint8_t	test_msg2[] = {
+	0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA, 0xF9, 0xF8,
+	0xF7, 0xF6, 0xF5, 0xF4, 0xF3, 0xF2, 0xF1, 0xF0,
+	0xEF, 0xEE, 0xED, 0xEC, 0xEB, 0xEA, 0xE9, 0xE8,
+	0xE7, 0xE6, 0xE5, 0xE4, 0xE3, 0xE2, 0xE1, 0xE0,
+	0xDF, 0xDE, 0xDD, 0xDC, 0xDB, 0xDA, 0xD9, 0xD8,
+	0xD7, 0xD6, 0xD5, 0xD4, 0xD3, 0xD2, 0xD1, 0xD0,
+	0xCF, 0xCE, 0xCD, 0xCC, 0xCB, 0xCA, 0xC9, 0xC8,
+	0xC7, 0xC6, 0xC5, 0xC4, 0xC3, 0xC2, 0xC1, 0xC0
+};
+
+const uint8_t	test_msg3[] = {
+	0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA, 0xF9, 0xF8,
+	0xF7, 0xF6, 0xF5, 0xF4, 0xF3, 0xF2, 0xF1, 0xF0,
+	0xEF, 0xEE, 0xED, 0xEC, 0xEB, 0xEA, 0xE9, 0xE8,
+	0xE7, 0xE6, 0xE5, 0xE4, 0xE3, 0xE2, 0xE1, 0xE0,
+	0xDF, 0xDE, 0xDD, 0xDC, 0xDB, 0xDA, 0xD9, 0xD8,
+	0xD7, 0xD6, 0xD5, 0xD4, 0xD3, 0xD2, 0xD1, 0xD0,
+	0xCF, 0xCE, 0xCD, 0xCC, 0xCB, 0xCA, 0xC9, 0xC8,
+	0xC7, 0xC6, 0xC5, 0xC4, 0xC3, 0xC2, 0xC1, 0xC0,
+	0xBF, 0xBE, 0xBD, 0xBC, 0xBB, 0xBA, 0xB9, 0xB8,
+	0xB7, 0xB6, 0xB5, 0xB4, 0xB3, 0xB2, 0xB1, 0xB0,
+	0xAF, 0xAE, 0xAD, 0xAC, 0xAB, 0xAA, 0xA9, 0xA8,
+	0xA7, 0xA6, 0xA5, 0xA4, 0xA3, 0xA2, 0xA1, 0xA0,
+	0x9F, 0x9E, 0x9D, 0x9C, 0x9B, 0x9A, 0x99, 0x98,
+	0x97, 0x96, 0x95, 0x94, 0x93, 0x92, 0x91, 0x90,
+	0x8F, 0x8E, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x88,
+	0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80
+};
+
+const uint8_t	test_msg4[] = {
+	0xFF, 0xFE, 0xFD, 0xFC, 0xFB, 0xFA, 0xF9, 0xF8,
+	0xF7, 0xF6, 0xF5, 0xF4, 0xF3, 0xF2, 0xF1, 0xF0,
+	0xEF, 0xEE, 0xED, 0xEC, 0xEB, 0xEA, 0xE9, 0xE8,
+	0xE7, 0xE6, 0xE5, 0xE4, 0xE3, 0xE2, 0xE1, 0xE0,
+	0xDF, 0xDE, 0xDD, 0xDC, 0xDB, 0xDA, 0xD9, 0xD8,
+	0xD7, 0xD6, 0xD5, 0xD4, 0xD3, 0xD2, 0xD1, 0xD0,
+	0xCF, 0xCE, 0xCD, 0xCC, 0xCB, 0xCA, 0xC9, 0xC8,
+	0xC7, 0xC6, 0xC5, 0xC4, 0xC3, 0xC2, 0xC1, 0xC0,
+	0xBF, 0xBE, 0xBD, 0xBC, 0xBB, 0xBA, 0xB9, 0xB8,
+	0xB7, 0xB6, 0xB5, 0xB4, 0xB3, 0xB2, 0xB1, 0xB0,
+	0xAF, 0xAE, 0xAD, 0xAC, 0xAB, 0xAA, 0xA9, 0xA8,
+	0xA7, 0xA6, 0xA5, 0xA4, 0xA3, 0xA2, 0xA1, 0xA0,
+	0x9F, 0x9E, 0x9D, 0x9C, 0x9B, 0x9A, 0x99, 0x98,
+	0x97, 0x96, 0x95, 0x94, 0x93, 0x92, 0x91, 0x90,
+	0x8F, 0x8E, 0x8D, 0x8C, 0x8B, 0x8A, 0x89, 0x88,
+	0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80,
+	0x7F, 0x7E, 0x7D, 0x7C, 0x7B, 0x7A, 0x79, 0x78,
+	0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70,
+	0x6F, 0x6E, 0x6D, 0x6C, 0x6B, 0x6A, 0x69, 0x68,
+	0x67, 0x66, 0x65, 0x64, 0x63, 0x62, 0x61, 0x60,
+	0x5F, 0x5E, 0x5D, 0x5C, 0x5B, 0x5A, 0x59, 0x58,
+	0x57, 0x56, 0x55, 0x54, 0x53, 0x52, 0x51, 0x50,
+	0x4F, 0x4E, 0x4D, 0x4C, 0x4B, 0x4A, 0x49, 0x48,
+	0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40,
+	0x3F, 0x3E, 0x3D, 0x3C, 0x3B, 0x3A, 0x39, 0x38,
+	0x37, 0x36, 0x35, 0x34, 0x33, 0x32, 0x31, 0x30,
+	0x2F, 0x2E, 0x2D, 0x2C, 0x2B, 0x2A, 0x29, 0x28,
+	0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20,
+	0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18,
+	0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
+	0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
+	0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+};
+
+/*
+ * Test digests from the Skein spec, Appendix C.
+ */
+const uint8_t	skein_256_test_digests[][32] = {
+	{
+		/* [0]: expected Skein-256/256 digest of test_msg0 */
+		0x0B, 0x98, 0xDC, 0xD1, 0x98, 0xEA, 0x0E, 0x50,
+		0xA7, 0xA2, 0x44, 0xC4, 0x44, 0xE2, 0x5C, 0x23,
+		0xDA, 0x30, 0xC1, 0x0F, 0xC9, 0xA1, 0xF2, 0x70,
+		0xA6, 0x63, 0x7F, 0x1F, 0x34, 0xE6, 0x7E, 0xD2
+	},
+	{
+		/* [1]: expected Skein-256/256 digest of test_msg1 */
+		0x8D, 0x0F, 0xA4, 0xEF, 0x77, 0x7F, 0xD7, 0x59,
+		0xDF, 0xD4, 0x04, 0x4E, 0x6F, 0x6A, 0x5A, 0xC3,
+		0xC7, 0x74, 0xAE, 0xC9, 0x43, 0xDC, 0xFC, 0x07,
+		0x92, 0x7B, 0x72, 0x3B, 0x5D, 0xBF, 0x40, 0x8B
+	},
+	{
+		/* [2]: expected Skein-256/256 digest of test_msg2 */
+		0xDF, 0x28, 0xE9, 0x16, 0x63, 0x0D, 0x0B, 0x44,
+		0xC4, 0xA8, 0x49, 0xDC, 0x9A, 0x02, 0xF0, 0x7A,
+		0x07, 0xCB, 0x30, 0xF7, 0x32, 0x31, 0x82, 0x56,
+		0xB1, 0x5D, 0x86, 0x5A, 0xC4, 0xAE, 0x16, 0x2F
+	}
+	/* no digests for test_msg3 and test_msg4, so no index 3 or 4 */
+};
+
+const uint8_t	skein_512_test_digests[][64] = {
+	{
+		/* [0]: expected Skein-512/512 digest of test_msg0 */
+		0x71, 0xB7, 0xBC, 0xE6, 0xFE, 0x64, 0x52, 0x22,
+		0x7B, 0x9C, 0xED, 0x60, 0x14, 0x24, 0x9E, 0x5B,
+		0xF9, 0xA9, 0x75, 0x4C, 0x3A, 0xD6, 0x18, 0xCC,
+		0xC4, 0xE0, 0xAA, 0xE1, 0x6B, 0x31, 0x6C, 0xC8,
+		0xCA, 0x69, 0x8D, 0x86, 0x43, 0x07, 0xED, 0x3E,
+		0x80, 0xB6, 0xEF, 0x15, 0x70, 0x81, 0x2A, 0xC5,
+		0x27, 0x2D, 0xC4, 0x09, 0xB5, 0xA0, 0x12, 0xDF,
+		0x2A, 0x57, 0x91, 0x02, 0xF3, 0x40, 0x61, 0x7A
+	},
+	{
+		/* no vector for test_msg1: filler so index == message no. */
+		0,
+	},
+	{
+		/* [2]: expected Skein-512/512 digest of test_msg2 */
+		0x45, 0x86, 0x3B, 0xA3, 0xBE, 0x0C, 0x4D, 0xFC,
+		0x27, 0xE7, 0x5D, 0x35, 0x84, 0x96, 0xF4, 0xAC,
+		0x9A, 0x73, 0x6A, 0x50, 0x5D, 0x93, 0x13, 0xB4,
+		0x2B, 0x2F, 0x5E, 0xAD, 0xA7, 0x9F, 0xC1, 0x7F,
+		0x63, 0x86, 0x1E, 0x94, 0x7A, 0xFB, 0x1D, 0x05,
+		0x6A, 0xA1, 0x99, 0x57, 0x5A, 0xD3, 0xF8, 0xC9,
+		0xA3, 0xCC, 0x17, 0x80, 0xB5, 0xE5, 0xFA, 0x4C,
+		0xAE, 0x05, 0x0E, 0x98, 0x98, 0x76, 0x62, 0x5B
+	},
+	{
+		/* [3]: expected Skein-512/512 digest of test_msg3 */
+		0x91, 0xCC, 0xA5, 0x10, 0xC2, 0x63, 0xC4, 0xDD,
+		0xD0, 0x10, 0x53, 0x0A, 0x33, 0x07, 0x33, 0x09,
+		0x62, 0x86, 0x31, 0xF3, 0x08, 0x74, 0x7E, 0x1B,
+		0xCB, 0xAA, 0x90, 0xE4, 0x51, 0xCA, 0xB9, 0x2E,
+		0x51, 0x88, 0x08, 0x7A, 0xF4, 0x18, 0x87, 0x73,
+		0xA3, 0x32, 0x30, 0x3E, 0x66, 0x67, 0xA7, 0xA2,
+		0x10, 0x85, 0x6F, 0x74, 0x21, 0x39, 0x00, 0x00,
+		0x71, 0xF4, 0x8E, 0x8B, 0xA2, 0xA5, 0xAD, 0xB7
+	}
+	/* no digest for test_msg4, so the table has no index 4 */
+};
+
+const uint8_t	skein_1024_test_digests[][128] = {
+	{
+		/* [0]: expected Skein-1024/1024 digest of test_msg0 */
+		0xE6, 0x2C, 0x05, 0x80, 0x2E, 0xA0, 0x15, 0x24,
+		0x07, 0xCD, 0xD8, 0x78, 0x7F, 0xDA, 0x9E, 0x35,
+		0x70, 0x3D, 0xE8, 0x62, 0xA4, 0xFB, 0xC1, 0x19,
+		0xCF, 0xF8, 0x59, 0x0A, 0xFE, 0x79, 0x25, 0x0B,
+		0xCC, 0xC8, 0xB3, 0xFA, 0xF1, 0xBD, 0x24, 0x22,
+		0xAB, 0x5C, 0x0D, 0x26, 0x3F, 0xB2, 0xF8, 0xAF,
+		0xB3, 0xF7, 0x96, 0xF0, 0x48, 0x00, 0x03, 0x81,
+		0x53, 0x1B, 0x6F, 0x00, 0xD8, 0x51, 0x61, 0xBC,
+		0x0F, 0xFF, 0x4B, 0xEF, 0x24, 0x86, 0xB1, 0xEB,
+		0xCD, 0x37, 0x73, 0xFA, 0xBF, 0x50, 0xAD, 0x4A,
+		0xD5, 0x63, 0x9A, 0xF9, 0x04, 0x0E, 0x3F, 0x29,
+		0xC6, 0xC9, 0x31, 0x30, 0x1B, 0xF7, 0x98, 0x32,
+		0xE9, 0xDA, 0x09, 0x85, 0x7E, 0x83, 0x1E, 0x82,
+		0xEF, 0x8B, 0x46, 0x91, 0xC2, 0x35, 0x65, 0x65,
+		0x15, 0xD4, 0x37, 0xD2, 0xBD, 0xA3, 0x3B, 0xCE,
+		0xC0, 0x01, 0xC6, 0x7F, 0xFD, 0xE1, 0x5B, 0xA8
+	},
+	{
+		/* no vector for test_msg1: filler so index == message no. */
+		0
+	},
+	{
+		/* no vector for test_msg2: filler so index == message no. */
+		0
+	},
+	{
+		/* [3]: expected Skein-1024/1024 digest of test_msg3 */
+		0x1F, 0x3E, 0x02, 0xC4, 0x6F, 0xB8, 0x0A, 0x3F,
+		0xCD, 0x2D, 0xFB, 0xBC, 0x7C, 0x17, 0x38, 0x00,
+		0xB4, 0x0C, 0x60, 0xC2, 0x35, 0x4A, 0xF5, 0x51,
+		0x18, 0x9E, 0xBF, 0x43, 0x3C, 0x3D, 0x85, 0xF9,
+		0xFF, 0x18, 0x03, 0xE6, 0xD9, 0x20, 0x49, 0x31,
+		0x79, 0xED, 0x7A, 0xE7, 0xFC, 0xE6, 0x9C, 0x35,
+		0x81, 0xA5, 0xA2, 0xF8, 0x2D, 0x3E, 0x0C, 0x7A,
+		0x29, 0x55, 0x74, 0xD0, 0xCD, 0x7D, 0x21, 0x7C,
+		0x48, 0x4D, 0x2F, 0x63, 0x13, 0xD5, 0x9A, 0x77,
+		0x18, 0xEA, 0xD0, 0x7D, 0x07, 0x29, 0xC2, 0x48,
+		0x51, 0xD7, 0xE7, 0xD2, 0x49, 0x1B, 0x90, 0x2D,
+		0x48, 0x91, 0x94, 0xE6, 0xB7, 0xD3, 0x69, 0xDB,
+		0x0A, 0xB7, 0xAA, 0x10, 0x6F, 0x0E, 0xE0, 0xA3,
+		0x9A, 0x42, 0xEF, 0xC5, 0x4F, 0x18, 0xD9, 0x37,
+		0x76, 0x08, 0x09, 0x85, 0xF9, 0x07, 0x57, 0x4F,
+		0x99, 0x5E, 0xC6, 0xA3, 0x71, 0x53, 0xA5, 0x78
+	},
+	{
+		/* [4]: expected Skein-1024/1024 digest of test_msg4 */
+		0x84, 0x2A, 0x53, 0xC9, 0x9C, 0x12, 0xB0, 0xCF,
+		0x80, 0xCF, 0x69, 0x49, 0x1B, 0xE5, 0xE2, 0xF7,
+		0x51, 0x5D, 0xE8, 0x73, 0x3B, 0x6E, 0xA9, 0x42,
+		0x2D, 0xFD, 0x67, 0x66, 0x65, 0xB5, 0xFA, 0x42,
+		0xFF, 0xB3, 0xA9, 0xC4, 0x8C, 0x21, 0x77, 0x77,
+		0x95, 0x08, 0x48, 0xCE, 0xCD, 0xB4, 0x8F, 0x64,
+		0x0F, 0x81, 0xFB, 0x92, 0xBE, 0xF6, 0xF8, 0x8F,
+		0x7A, 0x85, 0xC1, 0xF7, 0xCD, 0x14, 0x46, 0xC9,
+		0x16, 0x1C, 0x0A, 0xFE, 0x8F, 0x25, 0xAE, 0x44,
+		0x4F, 0x40, 0xD3, 0x68, 0x00, 0x81, 0xC3, 0x5A,
+		0xA4, 0x3F, 0x64, 0x0F, 0xD5, 0xFA, 0x3C, 0x3C,
+		0x03, 0x0B, 0xCC, 0x06, 0xAB, 0xAC, 0x01, 0xD0,
+		0x98, 0xBC, 0xC9, 0x84, 0xEB, 0xD8, 0x32, 0x27,
+		0x12, 0x92, 0x1E, 0x00, 0xB1, 0xBA, 0x07, 0xD6,
+		0xD0, 0x1F, 0x26, 0x90, 0x70, 0x50, 0x25, 0x5E,
+		0xF2, 0xC8, 0xE2, 0x4F, 0x71, 0x6C, 0x52, 0xA5
+	}
+};
+
+int
+main(int argc, char *argv[])
+{
+	uint64_t	cpu_mhz = 0;
+	boolean_t	failed = B_FALSE;
+
+	if (argc == 2)
+		cpu_mhz = atoi(argv[1]);
+	/* Hash a message array once and compare with the reference digest. */
+#define	SKEIN_ALGO_TEST(msg, sfx, bits, want)				\
+	do {								\
+		Skein ## sfx ## _Ctxt_t	sk;				\
+		uint8_t			md[bits / 8];			\
+		(void) Skein ## sfx ## _Init(&sk, bits);		\
+		(void) Skein ## sfx ## _Update(&sk, msg, sizeof (msg));	\
+		(void) Skein ## sfx ## _Final(&sk, md);			\
+		(void) printf("Skein" #sfx "/" #bits			\
+		    "\tMessage: " #msg "\tResult: ");			\
+		if (bcmp(md, want, bits / 8) != 0) {			\
+			(void) printf("FAILED!\n");			\
+			failed = B_TRUE;				\
+		} else {						\
+			(void) printf("OK\n");				\
+		}							\
+		NOTE(CONSTCOND)						\
+	} while (0)
+	/* Time 8192 updates over a zeroed 128 KiB buffer; print us and CPB. */
+#define	SKEIN_PERF_TEST(sfx, bits)					\
+	do {								\
+		Skein ## sfx ## _Ctxt_t	sk;				\
+		uint8_t			md[bits / 8];			\
+		uint8_t			buf[131072];			\
+		struct timeval		t0, t1;				\
+		uint64_t		usec;				\
+		double			cpb = 0;			\
+		int			n;				\
+		bzero(buf, sizeof (buf));				\
+		(void) gettimeofday(&t0, NULL);				\
+		(void) Skein ## sfx ## _Init(&sk, bits);		\
+		for (n = 8192; n > 0; n--) {				\
+			(void) Skein ## sfx ## _Update(&sk, buf,	\
+			    sizeof (buf));				\
+		}							\
+		(void) Skein ## sfx ## _Final(&sk, md);			\
+		(void) gettimeofday(&t1, NULL);				\
+		usec = (t1.tv_sec * 1000000llu + t1.tv_usec) -		\
+		    (t0.tv_sec * 1000000llu + t0.tv_usec);		\
+		if (cpu_mhz != 0) {					\
+			cpb = (cpu_mhz * 1e6 * ((double)usec /		\
+			    1000000)) / (8192 * 128 * 1024);		\
+		}							\
+		(void) printf("Skein" #sfx "/" #bits "\t%llu us "	\
+		    "(%.02f CPB)\n", (u_longlong_t)usec, cpb);		\
+		NOTE(CONSTCOND)						\
+	} while (0)
+
+	(void) printf("Running algorithm correctness tests:\n");
+	SKEIN_ALGO_TEST(test_msg0, _256, 256, skein_256_test_digests[0]);
+	SKEIN_ALGO_TEST(test_msg1, _256, 256, skein_256_test_digests[1]);
+	SKEIN_ALGO_TEST(test_msg2, _256, 256, skein_256_test_digests[2]);
+	SKEIN_ALGO_TEST(test_msg0, _512, 512, skein_512_test_digests[0]);
+	SKEIN_ALGO_TEST(test_msg2, _512, 512, skein_512_test_digests[2]);
+	SKEIN_ALGO_TEST(test_msg3, _512, 512, skein_512_test_digests[3]);
+	SKEIN_ALGO_TEST(test_msg0, 1024, 1024, skein_1024_test_digests[0]);
+	SKEIN_ALGO_TEST(test_msg3, 1024, 1024, skein_1024_test_digests[3]);
+	SKEIN_ALGO_TEST(test_msg4, 1024, 1024, skein_1024_test_digests[4]);
+	if (failed)
+		return (1);
+
+	(void) printf("Running performance tests (hashing 1024 MiB of "
+	    "data):\n");
+	SKEIN_PERF_TEST(_256, 256);
+	SKEIN_PERF_TEST(_512, 512);
+	SKEIN_PERF_TEST(1024, 1024);
+
+	return (0);
+}
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh
index edc7a3fb9..27003b21b 100755
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh
@@ -46,7 +46,7 @@
 verify_runnable "both"
 
 set -A dataset "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL/$TESTVOL"
-set -A values "on" "off" "fletcher2" "fletcher4" "sha256"
+set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "noparity"
 
 log_assert "Setting a valid checksum on a file system, volume," \
 	"it should be successful."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
index f7a1d9cb1..3807d0af6 100644
--- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
+++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
@@ -37,7 +37,8 @@ typeset -a properties=("size" "capacity" "altroot" "health" "guid" "version"
     "feature@async_destroy" "feature@empty_bpobj" "feature@lz4_compress"
     "feature@large_blocks" "feature@large_dnode" "feature@filesystem_limits"
     "feature@spacemap_histogram" "feature@enabled_txg" "feature@hole_birth"
-    "feature@extensible_dataset" "feature@bookmarks" "feature@embedded_data")
+    "feature@extensible_dataset" "feature@bookmarks" "feature@embedded_data"
+    "feature@sha512" "feature@skein" "feature@edonr")
 else
 typeset -a properties=("size" "capacity" "altroot" "health" "guid" "version"
     "bootfs" ""leaked" delegation" "autoreplace" "cachefile" "dedupditto" "dedupratio"
@@ -45,5 +46,6 @@ typeset -a properties=("size" "capacity" "altroot" "health" "guid" "version"
     "listsnapshots" "autoexpand" "feature@async_destroy" "feature@empty_bpobj"
     "feature@lz4_compress" "feature@multi_vdev_crash_dump"
     "feature@spacemap_histogram" "feature@enabled_txg" "feature@hole_birth"
-    "feature@extensible_dataset" "feature@bookmarks")
+    "feature@extensible_dataset" "feature@bookmarks" "feature@sha512"
+    "feature@skein" "feature@edonr")
 fi