Diffstat (limited to 'module/icp/asm-x86_64')
-rw-r--r-- | module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman | 23
-rw-r--r-- | module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip | 1
-rw-r--r-- | module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl | 127
-rw-r--r-- | module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip | 1
-rw-r--r-- | module/icp/asm-x86_64/aes/aes_amd64.S | 900
-rw-r--r-- | module/icp/asm-x86_64/aes/aes_intel.S | 851
-rw-r--r-- | module/icp/asm-x86_64/aes/aeskey.c | 580
-rw-r--r-- | module/icp/asm-x86_64/aes/aesopt.h | 770
-rw-r--r-- | module/icp/asm-x86_64/aes/aestab.h | 165
-rw-r--r-- | module/icp/asm-x86_64/aes/aestab2.h | 594
-rw-r--r-- | module/icp/asm-x86_64/modes/gcm_intel.S | 334
-rw-r--r-- | module/icp/asm-x86_64/sha1/sha1-x86_64.S | 1346
-rw-r--r-- | module/icp/asm-x86_64/sha2/sha256_impl.S | 2060
13 files changed, 7752 insertions, 0 deletions
diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman new file mode 100644 index 000000000..48fea7bb3 --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman @@ -0,0 +1,23 @@ + --------------------------------------------------------------------------- + Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software is allowed (with or without + changes) provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip new file mode 100644 index 000000000..5f822cf27 --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip @@ -0,0 +1 @@ +PORTIONS OF AES FUNCTIONALITY diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl new file mode 100644 index 000000000..a2c4adcbe --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl @@ -0,0 +1,127 @@ + + LICENSE ISSUES + ============== + + The OpenSSL toolkit stays under a dual license, i.e. both the conditions of + the OpenSSL License and the original SSLeay license apply to the toolkit. + See below for the actual license texts. Actually both licenses are BSD-style + Open Source licenses. In case of any license issues related to OpenSSL + please contact [email protected]. + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * [email protected]. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. 
+ * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * ([email protected]). This product includes software written by Tim + * Hudson ([email protected]). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young ([email protected]) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young ([email protected]). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson ([email protected]). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young ([email protected])" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. 
If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson ([email protected])" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip new file mode 100644 index 000000000..5f822cf27 --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip @@ -0,0 +1 @@ +PORTIONS OF AES FUNCTIONALITY diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S new file mode 100644 index 000000000..fb6444119 --- /dev/null +++ b/module/icp/asm-x86_64/aes/aes_amd64.S @@ -0,0 +1,900 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue 20/12/2007 + * + * I am grateful to Dag Arne Osvik for many discussions of the techniques that + * can be used to optimise AES assembler code on AMD64/EM64T architectures. + * Some of the techniques used in this implementation are the result of + * suggestions made by him for which I am most grateful. + * + * An AES implementation for AMD64 processors using the YASM assembler. This + * implementation provides only encryption, decryption and hence requires key + * scheduling support in C. It uses 8k bytes of tables but its encryption and + * decryption performance is very close to that obtained using large tables. 
+ * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions, + * which are as follows: + * ms windows gnu/linux/opensolaris os + * + * in_blk rcx rdi + * out_blk rdx rsi + * context (cx) r8 rdx + * + * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15 + * registers rdi - on both + * + * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11 + * registers - rdi on both + * + * The convention used here is that for gnu/linux/opensolaris os. + * + * This code provides the standard AES block size (128 bits, 16 bytes) and the + * three standard AES key sizes (128, 192 and 256 bits). It has the same call + * interface as my C implementation. It uses the Microsoft C AMD64 calling + * conventions in which the three parameters are placed in rcx, rdx and r8 + * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved. + * + * OpenSolaris Note: + * Modified to use GNU/Linux/Solaris calling conventions. + * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively. + * + * AES_RETURN aes_encrypt(const unsigned char in_blk[], + * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt(const unsigned char in_blk[], + * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[], + * const aes_encrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[], + * const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_encrypt_key(const unsigned char key[], + * unsigned int len, const aes_decrypt_ctx cx[1])/ + * + * AES_RETURN aes_decrypt_key(const unsigned char key[], + * unsigned int len, const aes_decrypt_ctx cx[1])/ + * + * where <NNN> is 128, 102 or 256. In the last two calls the length can be in + * either bits or bytes. + * + * Comment in/out the following lines to obtain the desired subroutines. These + * selections MUST match those in the C header file aesopt.h + */ +#define AES_REV_DKS /* define if key decryption schedule is reversed */ + +#define LAST_ROUND_TABLES /* define for the faster version using extra tables */ + +/* + * The encryption key schedule has the following in memory layout where N is the + * number of rounds (10, 12 or 14): + * + * lo: | input key (round 0) | / each round is four 32-bit words + * | encryption round 1 | + * | encryption round 2 | + * .... + * | encryption round N-1 | + * hi: | encryption round N | + * + * The decryption key schedule is normally set up so that it has the same + * layout as above by actually reversing the order of the encryption key + * schedule in memory (this happens when AES_REV_DKS is set): + * + * lo: | decryption round 0 | = | encryption round N | + * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ] + * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ] + * .... .... + * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] + * hi: | decryption round N | = | input key (round 0) | + * + * with rounds except the first and last modified using inv_mix_column() + * But if AES_REV_DKS is NOT set the order of keys is left as it is for + * encryption so that it has to be accessed in reverse when used for + * decryption (although the inverse mix column modifications are done) + * + * lo: | decryption round 0 | = | input key (round 0) | + * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] + * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] + * .... .... 
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] + * hi: | decryption round N | = | encryption round N | + * + * This layout is faster when the assembler key scheduling provided here + * is used. + * + * End of user defines + */ + +/* + * --------------------------------------------------------------------------- + * OpenSolaris OS modifications + * + * This source originates from Brian Gladman file aes_amd64.asm + * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip + * with these changes: + * + * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and + * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, + * AES_128, AES_192, AES_256, AES_VAR ifdefs. + * + * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define + * + * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef + * + * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax + * (operands reversed, literals prefixed with "$", registers prefixed with "%", + * and "[register+offset]", addressing changed to "offset(register)", + * parenthesis in constant expressions "()" changed to square brackets "[]", + * "." removed from local (numeric) labels, and other changes. + * Examples: + * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax + * mov rax,(4*20h) mov $[4*0x20],%rax + * mov rax,[ebx+20h] mov 0x20(%ebx),%rax + * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax + * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax + * + * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 6. Renamed functions and reordered parameters to match OpenSolaris: + * Original Gladman interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, + * and a union type, inf., containing inf.l, a uint32_t and + * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is + * used and contains the key schedule length * 16 where key schedule length is + * 10, 12, or 14 bytes. + * + * OpenSolaris OS interface: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ + * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. 
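As a minimal usage sketch (not part of the patch), assuming only the OpenSolaris prototypes and the aes_ks_t layout described above, a caller of the encryption entry point could look like the following; the round count and key schedule are placeholders, since a real caller fills the schedule via the key-setup code:

#include <stdint.h>

#define MAX_AES_NR 14

typedef union {
	uint64_t ks64[(MAX_AES_NR + 1) * 4];
	uint32_t ks32[(MAX_AES_NR + 1) * 4];
} aes_ks_t;

/* Prototype as described in the comment above. */
extern void aes_encrypt_amd64(const aes_ks_t *ks, int Nr,
    const uint32_t pt[4], uint32_t ct[4]);

static void
encrypt_one_block_sketch(const aes_ks_t *ks, int Nr)
{
	uint32_t pt[4] = { 0, 0, 0, 0 };	/* one 16-byte plaintext block */
	uint32_t ct[4];				/* ciphertext output */

	/* Nr is 10, 12, or 14 for 128-, 192-, or 256-bit keys. */
	aes_encrypt_amd64(ks, Nr, pt, ct);
}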
+ */ + +#if defined(lint) || defined(__lint) + +#include <sys/types.h> +/* ARGSUSED */ +void +aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} + + +#else + +#define _ASM +#include <sys/asm_linkage.h> + +#define KS_LENGTH 60 + +#define raxd eax +#define rdxd edx +#define rcxd ecx +#define rbxd ebx +#define rsid esi +#define rdid edi + +#define raxb al +#define rdxb dl +#define rcxb cl +#define rbxb bl +#define rsib sil +#define rdib dil + +// finite field multiplies by {02}, {04} and {08} + +#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] +#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] +#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] + +// finite field multiplies required in table generation + +#define f3(x) [[f2(x)] ^ [x]] +#define f9(x) [[f8(x)] ^ [x]] +#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] +#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] +#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] + +// macros for expanding S-box data + +#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] +#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] +#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 + +#define enc_vals(x) \ + .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ + .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ + .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ + .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ + .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ + .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ + .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ + .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ + .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ + .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ + .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ + .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ + .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ + .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ + .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ + .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ + .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ + .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ + .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ + .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ + .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ + .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ + .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ + .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ + .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ + .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ + .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ + .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ + .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ + .byte 
x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ + .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ + .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) + +#define dec_vals(x) \ + .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ + .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ + .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ + .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ + .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ + .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ + .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ + .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ + .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ + .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ + .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ + .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ + .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ + .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ + .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ + .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ + .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ + .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ + .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ + .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ + .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ + .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ + .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ + .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ + .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ + .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ + .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ + .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ + .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ + .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ + .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ + .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) + +#define tptr %rbp /* table pointer */ +#define kptr %r8 /* key schedule pointer */ +#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ +#define fk_ref(x, y) -16*x+fofs+4*y(kptr) + +#ifdef AES_REV_DKS +#define rofs 128 +#define ik_ref(x, y) -16*x+rofs+4*y(kptr) + +#else +#define rofs -128 +#define ik_ref(x, y) 16*x+rofs+4*y(kptr) +#endif /* AES_REV_DKS */ + +#define tab_0(x) (tptr,x,8) +#define tab_1(x) 3(tptr,x,8) +#define tab_2(x) 2(tptr,x,8) +#define tab_3(x) 1(tptr,x,8) +#define tab_f(x) 1(tptr,x,8) +#define tab_i(x) 7(tptr,x,8) + +#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor 
tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + add $2048, tptr; \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1 + +#else + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p1 + +#endif /* LAST_ROUND_TABLES */ + +#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + 
mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + add $2048, tptr; \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3 + +#else + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %eax; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ebx; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; 
\ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p3 + +#endif /* LAST_ROUND_TABLES */ + +/* + * OpenSolaris OS: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ + .align 64 +enc_tab: + enc_vals(u8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + enc_vals(w8) +#endif + + + ENTRY_NP(aes_encrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // OpenSolaris OS interface + sub $[4*8], %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea enc_tab(%rip), tptr + sub $fofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + + xor fofs(kptr), %eax + xor fofs+4(kptr), %ebx + xor fofs+8(kptr), %ecx + xor fofs+12(kptr), %edx + + lea (kptr,%rsi), kptr + // Jump based on byte key length * 16: + cmp $[10*16], %esi + je 3f + cmp $[12*16], %esi + je 2f + cmp $[14*16], %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal forward rounds +1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) + fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $[4*8], %rsp + ret + + SET_SIZE(aes_encrypt_amd64) + +/* + * OpenSolaris OS: + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ + .align 64 +dec_tab: + dec_vals(v8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + dec_vals(w8) +#endif + + + ENTRY_NP(aes_decrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // 
OpenSolaris OS interface + sub $[4*8], %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea dec_tab(%rip), tptr + sub $rofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + +#ifdef AES_REV_DKS + mov kptr, %rdi + lea (kptr,%rsi), kptr +#else + lea (kptr,%rsi), %rdi +#endif + + xor rofs(%rdi), %eax + xor rofs+4(%rdi), %ebx + xor rofs+8(%rdi), %ecx + xor rofs+12(%rdi), %edx + + // Jump based on byte key length * 16: + cmp $[10*16], %esi + je 3f + cmp $[12*16], %esi + je 2f + cmp $[14*16], %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal inverse rounds +1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) + il_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $[4*8], %rsp + ret + + SET_SIZE(aes_decrypt_amd64) +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/aes/aes_intel.S b/module/icp/asm-x86_64/aes/aes_intel.S new file mode 100644 index 000000000..0b4700f96 --- /dev/null +++ b/module/icp/asm-x86_64/aes/aes_intel.S @@ -0,0 +1,851 @@ +/* + * ==================================================================== + * Written by Intel Corporation for the OpenSSL project to add support + * for Intel AES-NI instructions. Rights for redistribution and usage + * in source and binary forms are granted according to the OpenSSL + * license. + * + * Author: Huang Ying <ying.huang at intel dot com> + * Vinodh Gopal <vinodh.gopal at intel dot com> + * Kahraman Akdemir + * + * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) + * instructions that are going to be introduced in the next generation + * of Intel processor, as of 2009. These instructions enable fast and + * secure data encryption and decryption, using the Advanced Encryption + * Standard (AES), defined by FIPS Publication number 197. The + * architecture introduces six instructions that offer full hardware + * support for AES. Four of them support high performance data + * encryption and decryption, and the other two instructions support + * the AES key expansion procedure. + * ==================================================================== + */ + +/* + * ==================================================================== + * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * [email protected]. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as files aes-intel.S and eng_aesni_asm.pl, in + * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by + * Huang Ying of Intel to the openssl-dev mailing list under the subject + * of "Add support to Intel AES-NI instruction set for x86_64 platform". + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. 
Renamed functions, reordered parameters, and changed return value + * to match OpenSolaris: + * + * OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return values for above are non-zero on error, 0 on success. + * + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * typedef struct aes_key_st { + * unsigned int rd_key[4 *(AES_MAXNR + 1)]; + * int rounds; + * unsigned int pad[3]; + * } AES_KEY; + * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules + * (ks32) instead of 64-bit (ks64). + * Number of rounds (aka round count) is at offset 240 of AES_KEY. + * + * OpenSolaris OS interface (#ifdefs removed for readability): + * int rijndael_key_setup_dec_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * int rijndael_key_setup_enc_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * Return values for above are 0 on error, number of rounds on success. + * + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; + * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; + * + * typedef union { + * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; + * } aes_ks_t; + * typedef struct aes_key { + * aes_ks_t encr_ks, decr_ks; + * long double align128; + * int flags, nr, type; + * } aes_key_t; + * + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. + * + * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. + * + * ==================================================================== + */ + +#if defined(lint) || defined(__lint) + +#include <sys/types.h> + +/* ARGSUSED */ +void +aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} +/* ARGSUSED */ +int +rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} +/* ARGSUSED */ +int +rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} + + +#else /* lint */ + +#define _ASM +#include <sys/asm_linkage.h> + +#ifdef _KERNEL + /* + * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, + * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it + * uses it to pass P2 to syscall. + * This also occurs with the STTS macro, but we dont care if + * P2 (%rsi) is modified just before function exit. + * The CLTS and STTS macros push and pop P1 (%rdi) already. 
+ */ +#ifdef __xpv +#define PROTECTED_CLTS \ + push %rsi; \ + CLTS; \ + pop %rsi +#else +#define PROTECTED_CLTS \ + CLTS +#endif /* __xpv */ + +#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $[XMM_SIZE * 2], %rsp; \ + movaps %xmm0, 16(%rsp); \ + movaps %xmm1, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + /* + * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack, + * otherwise set CR0_TS. + */ +#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm1; \ + movaps 16(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + /* + * If CR0_TS is not set, align stack (with push %rbp) and push + * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS + */ +#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $[XMM_SIZE * 7], %rsp; \ + movaps %xmm0, 96(%rsp); \ + movaps %xmm1, 80(%rsp); \ + movaps %xmm2, 64(%rsp); \ + movaps %xmm3, 48(%rsp); \ + movaps %xmm4, 32(%rsp); \ + movaps %xmm5, 16(%rsp); \ + movaps %xmm6, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + + /* + * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack, + * otherwise set CR0_TS. + */ +#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm6; \ + movaps 16(%rsp), %xmm5; \ + movaps 32(%rsp), %xmm4; \ + movaps 48(%rsp), %xmm3; \ + movaps 64(%rsp), %xmm2; \ + movaps 80(%rsp), %xmm1; \ + movaps 96(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + +#else +#define PROTECTED_CLTS +#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) +#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) +#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) +#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) +#endif /* _KERNEL */ + + +/* + * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), + * _key_expansion_256a(), _key_expansion_256b() + * + * Helper functions called by rijndael_key_setup_inc_intel(). + * Also used indirectly by rijndael_key_setup_dec_intel(). 
+ * + * Input: + * %xmm0 User-provided cipher key + * %xmm1 Round constant + * Output: + * (%rcx) AES key + */ + +.align 16 +_key_expansion_128: +_key_expansion_256a: + pshufd $0b11111111, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + movaps %xmm0, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_128) + SET_SIZE(_key_expansion_256a) + +.align 16 +_key_expansion_192a: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movaps %xmm2, %xmm5 + movaps %xmm2, %xmm6 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movaps %xmm0, %xmm1 + shufps $0b01000100, %xmm0, %xmm6 + movaps %xmm6, (%rcx) + shufps $0b01001110, %xmm2, %xmm1 + movaps %xmm1, 0x10(%rcx) + add $0x20, %rcx + ret + SET_SIZE(_key_expansion_192a) + +.align 16 +_key_expansion_192b: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movaps %xmm2, %xmm5 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movaps %xmm0, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_192b) + +.align 16 +_key_expansion_256b: + pshufd $0b10101010, %xmm1, %xmm1 + shufps $0b00010000, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + shufps $0b10001100, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + pxor %xmm1, %xmm2 + movaps %xmm2, (%rcx) + add $0x10, %rcx + ret + SET_SIZE(_key_expansion_256b) + + +/* + * rijndael_key_setup_enc_intel() + * Expand the cipher key into the encryption key schedule. + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. + * + * Original Intel OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. 
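A hedged sketch of the calling discipline spelled out above, assuming the OpenSolaris prototype and that kpreempt_disable()/kpreempt_enable() are available to a kernel caller; the wrapper name and buffer sizing (MAX_AES_NR = 14) are illustrative:

#include <stdint.h>

extern int rijndael_key_setup_enc_intel(uint32_t rk[],
    const uint32_t cipherKey[], uint64_t keyBits);
extern void kpreempt_disable(void);
extern void kpreempt_enable(void);

static int
setup_enc_schedule_sketch(uint32_t rk[(14 + 1) * 4],
    const uint32_t cipherKey[8], uint64_t keyBits)
{
	int nr;

	/* %xmm state is not saved/restored, so block preemption around the call. */
	kpreempt_disable();
	nr = rijndael_key_setup_enc_intel(rk, cipherKey, keyBits);
	kpreempt_enable();

	return (nr);	/* 0 on error, otherwise 10, 12, or 14 rounds */
}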
+ */ + +#ifdef OPENSSL_INTERFACE +#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key +#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key + +#define USERCIPHERKEY rdi /* P1, 64 bits */ +#define KEYSIZE32 esi /* P2, 32 bits */ +#define KEYSIZE64 rsi /* P2, 64 bits */ +#define AESKEY rdx /* P3, 64 bits */ + +#else /* OpenSolaris Interface */ +#define AESKEY rdi /* P1, 64 bits */ +#define USERCIPHERKEY rsi /* P2, 64 bits */ +#define KEYSIZE32 edx /* P3, 32 bits */ +#define KEYSIZE64 rdx /* P3, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define ROUNDS32 KEYSIZE32 /* temp */ +#define ROUNDS64 KEYSIZE64 /* temp */ +#define ENDAESKEY USERCIPHERKEY /* temp */ + +ENTRY_NP(rijndael_key_setup_enc_intel) +rijndael_key_setup_enc_intel_local: + CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10) + + // NULL pointer sanity check + test %USERCIPHERKEY, %USERCIPHERKEY + jz .Lenc_key_invalid_param + test %AESKEY, %AESKEY + jz .Lenc_key_invalid_param + + movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) + movaps %xmm0, (%AESKEY) + lea 0x10(%AESKEY), %rcx // key addr + pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x + + cmp $256, %KEYSIZE32 + jnz .Lenc_key192 + + // AES 256: 14 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $14, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 +#endif /* OPENSSL_INTERFACE */ + + movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) + movaps %xmm2, (%rcx) + add $0x10, %rcx + + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x1, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x2, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x4, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x8, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x10, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + aeskeygenassist $0x20, %xmm0, %xmm1 + call _key_expansion_256b + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_256a + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* Open Solaris Interface */ + mov $14, %rax // return # rounds = 14 +#endif + ret + +.align 4 +.Lenc_key192: + cmp $192, %KEYSIZE32 + jnz .Lenc_key128 + + // AES 192: 12 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $12, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12 +#endif /* OPENSSL_INTERFACE */ + + movq 0x10(%USERCIPHERKEY), %xmm2 // other user key + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_192b + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_192a + aeskeygenassist $0x80, %xmm2, 
%xmm1 // expand the key + call _key_expansion_192b + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $12, %rax // return # rounds = 12 +#endif + ret + +.align 4 +.Lenc_key128: + cmp $128, %KEYSIZE32 + jnz .Lenc_key_invalid_key_bits + + // AES 128: 10 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $10, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 +#endif /* OPENSSL_INTERFACE */ + + aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key + call _key_expansion_128 + + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $10, %rax // return # rounds = 10 +#endif + ret + +.Lenc_key_invalid_param: +#ifdef OPENSSL_INTERFACE + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) + mov $-1, %rax // user key or AES key pointer is NULL + ret +#else + /* FALLTHROUGH */ +#endif /* OPENSSL_INTERFACE */ + +.Lenc_key_invalid_key_bits: + SET_TS_OR_POP_XMM0_TO_XMM6(%r10) +#ifdef OPENSSL_INTERFACE + mov $-2, %rax // keysize is invalid +#else /* Open Solaris Interface */ + xor %rax, %rax // a key pointer is NULL or invalid keysize +#endif /* OPENSSL_INTERFACE */ + + ret + SET_SIZE(rijndael_key_setup_enc_intel) + + +/* + * rijndael_key_setup_dec_intel() + * Expand the cipher key into the decryption key schedule. + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. + * P1->P2, P2->P3, P3->P1 + * + * Original Intel OpenSSL interface: + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. 
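To make the two return conventions concrete, a hypothetical wrapper (not in the patch) that presents the OpenSSL-style result on top of the OpenSolaris entry point could read:

#include <stdint.h>

extern int rijndael_key_setup_dec_intel(uint32_t rk[],
    const uint32_t cipherKey[], uint64_t keyBits);

/*
 * Illustrative only: parameters follow the OpenSSL order (key, bits, schedule)
 * and the return value mirrors intel_AES_set_decrypt_key(), i.e. 0 on success
 * and non-zero on error, while the underlying routine returns the round count
 * (10, 12, or 14) on success and 0 on error.
 */
static int
set_decrypt_key_openssl_style(const uint32_t cipherKey[], uint64_t keyBits,
    uint32_t rk[])
{
	int rounds = rijndael_key_setup_dec_intel(rk, cipherKey, keyBits);

	return (rounds == 0 ? -1 : 0);
}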
+ */ +ENTRY_NP(rijndael_key_setup_dec_intel) + // Generate round keys used for encryption + call rijndael_key_setup_enc_intel_local + test %rax, %rax +#ifdef OPENSSL_INTERFACE + jnz .Ldec_key_exit // Failed if returned non-0 +#else /* OpenSolaris Interface */ + jz .Ldec_key_exit // Failed if returned 0 +#endif /* OPENSSL_INTERFACE */ + + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + /* + * Convert round keys used for encryption + * to a form usable for decryption + */ +#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ + mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) + // (already set for OpenSSL) +#endif + + lea 0x10(%AESKEY), %rcx // key addr + shl $4, %ROUNDS32 + add %AESKEY, %ROUNDS64 + mov %ROUNDS64, %ENDAESKEY + +.align 4 +.Ldec_key_reorder_loop: + movaps (%AESKEY), %xmm0 + movaps (%ROUNDS64), %xmm1 + movaps %xmm0, (%ROUNDS64) + movaps %xmm1, (%AESKEY) + lea 0x10(%AESKEY), %AESKEY + lea -0x10(%ROUNDS64), %ROUNDS64 + cmp %AESKEY, %ROUNDS64 + ja .Ldec_key_reorder_loop + +.align 4 +.Ldec_key_inv_loop: + movaps (%rcx), %xmm0 + // Convert an encryption round key to a form usable for decryption + // with the "AES Inverse Mix Columns" instruction + aesimc %xmm0, %xmm1 + movaps %xmm1, (%rcx) + lea 0x10(%rcx), %rcx + cmp %ENDAESKEY, %rcx + jnz .Ldec_key_inv_loop + + SET_TS_OR_POP_XMM0_XMM1(%r10) + +.Ldec_key_exit: + // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error + // OpenSSL: rax = 0 for OK, or non-zero for error + ret + SET_SIZE(rijndael_key_setup_dec_intel) + + +/* + * aes_encrypt_intel() + * Encrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]) + * + * Original Intel OpenSSL Interface: + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key) + */ + +#ifdef OPENSSL_INTERFACE +#define aes_encrypt_intel intel_AES_encrypt +#define aes_decrypt_intel intel_AES_decrypt + +#define INP rdi /* P1, 64 bits */ +#define OUTP rsi /* P2, 64 bits */ +#define KEYP rdx /* P3, 64 bits */ + +/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ +#define NROUNDS32 ecx /* temporary, 32 bits */ +#define NROUNDS cl /* temporary, 8 bits */ + +#else /* OpenSolaris Interface */ +#define KEYP rdi /* P1, 64 bits */ +#define NROUNDS esi /* P2, 32 bits */ +#define INP rdx /* P3, 64 bits */ +#define OUTP rcx /* P4, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define STATE xmm0 /* temporary, 128 bits */ +#define KEY xmm1 /* temporary, 128 bits */ + +ENTRY_NP(aes_encrypt_intel) + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + movups (%INP), %STATE // input + movaps (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Lenc128 + lea 0x20(%KEYP), %KEYP + je .Lenc192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movaps -0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movaps -0x50(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc192: + // AES 192 and 256 + movaps -0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movaps -0x30(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc128: + // AES 128, 192, and 256 + movaps -0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movaps -0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movaps (%KEYP), %KEY + aesenc %KEY, %STATE + movaps 0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movaps 0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movaps 0x30(%KEYP), %KEY + aesenc %KEY, %STATE + movaps 0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movaps 0x50(%KEYP), %KEY + aesenc %KEY, %STATE + movaps 0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movaps 0x70(%KEYP), %KEY + aesenclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + SET_TS_OR_POP_XMM0_XMM1(%r10) + ret + SET_SIZE(aes_encrypt_intel) + + +/* + * aes_decrypt_intel() + * Decrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original Intel OpenSSL Interface: + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + */ +ENTRY_NP(aes_decrypt_intel) + CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10) + + movups (%INP), %STATE // input + movaps (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Ldec128 + lea 0x20(%KEYP), %KEYP + je .Ldec192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movaps -0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movaps -0x50(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec192: + // AES 192 and 256 + movaps -0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movaps -0x30(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec128: + // AES 128, 192, and 256 + movaps -0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movaps -0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movaps (%KEYP), %KEY + aesdec %KEY, %STATE + movaps 0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movaps 0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movaps 0x30(%KEYP), %KEY + aesdec %KEY, %STATE + movaps 0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movaps 0x50(%KEYP), %KEY + aesdec %KEY, %STATE + movaps 0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movaps 0x70(%KEYP), %KEY + aesdeclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + SET_TS_OR_POP_XMM0_XMM1(%r10) + ret + SET_SIZE(aes_decrypt_intel) + +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/aes/aeskey.c b/module/icp/asm-x86_64/aes/aeskey.c new file mode 100644 index 000000000..96767fbea --- /dev/null +++ b/module/icp/asm-x86_64/aes/aeskey.c @@ -0,0 +1,580 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue Date: 20/12/2007 + */ + +#include <aes/aes_impl.h> +#include "aesopt.h" +#include "aestab.h" +#include "aestab2.h" + +/* + * Initialise the key schedule from the user supplied key. The key + * length can be specified in bytes, with legal values of 16, 24 + * and 32, or in bits, with legal values of 128, 192 and 256. These + * values correspond with Nk values of 4, 6 and 8 respectively. + * + * The following macros implement a single cycle in the key + * schedule generation process. 
The number of cycles needed + * for each cx->n_col and nk value is: + * + * nk = 4 5 6 7 8 + * ------------------------------ + * cx->n_col = 4 10 9 8 7 7 + * cx->n_col = 5 14 11 10 9 9 + * cx->n_col = 6 19 15 12 11 11 + * cx->n_col = 7 21 19 16 13 14 + * cx->n_col = 8 29 23 19 17 14 + */ + +/* + * OpenSolaris changes + * 1. Added header files aes_impl.h and aestab2.h + * 2. Changed uint_8t and uint_32t to uint8_t and uint32_t + * 3. Remove code under ifdef USE_VIA_ACE_IF_PRESENT (always undefined) + * 4. Removed always-defined ifdefs FUNCS_IN_C, ENC_KEYING_IN_C, + * AES_128, AES_192, AES_256, AES_VAR defines + * 5. Changed aes_encrypt_key* aes_decrypt_key* functions to "static void" + * 6. Changed N_COLS to MAX_AES_NB + * 7. Replaced functions aes_encrypt_key and aes_decrypt_key with + * OpenSolaris-compatible functions rijndael_key_setup_enc_amd64 and + * rijndael_key_setup_dec_amd64 + * 8. cstyled code and removed lint warnings + */ + +#if defined(REDUCE_CODE_SIZE) +#define ls_box ls_sub + uint32_t ls_sub(const uint32_t t, const uint32_t n); +#define inv_mcol im_sub + uint32_t im_sub(const uint32_t x); +#ifdef ENC_KS_UNROLL +#undef ENC_KS_UNROLL +#endif +#ifdef DEC_KS_UNROLL +#undef DEC_KS_UNROLL +#endif +#endif /* REDUCE_CODE_SIZE */ + + +#define ke4(k, i) \ +{ k[4 * (i) + 4] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[4 * (i) + 5] = ss[1] ^= ss[0]; \ + k[4 * (i) + 6] = ss[2] ^= ss[1]; \ + k[4 * (i) + 7] = ss[3] ^= ss[2]; \ +} + +static void +aes_encrypt_key128(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[4]; + + rk[0] = ss[0] = word_in(key, 0); + rk[1] = ss[1] = word_in(key, 1); + rk[2] = ss[2] = word_in(key, 2); + rk[3] = ss[3] = word_in(key, 3); + +#ifdef ENC_KS_UNROLL + ke4(rk, 0); ke4(rk, 1); + ke4(rk, 2); ke4(rk, 3); + ke4(rk, 4); ke4(rk, 5); + ke4(rk, 6); ke4(rk, 7); + ke4(rk, 8); +#else + { + uint32_t i; + for (i = 0; i < 9; ++i) + ke4(rk, i); + } +#endif /* ENC_KS_UNROLL */ + ke4(rk, 9); +} + + +#define kef6(k, i) \ +{ k[6 * (i) + 6] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[6 * (i) + 7] = ss[1] ^= ss[0]; \ + k[6 * (i) + 8] = ss[2] ^= ss[1]; \ + k[6 * (i) + 9] = ss[3] ^= ss[2]; \ +} + +#define ke6(k, i) \ +{ kef6(k, i); \ + k[6 * (i) + 10] = ss[4] ^= ss[3]; \ + k[6 * (i) + 11] = ss[5] ^= ss[4]; \ +} + +static void +aes_encrypt_key192(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[6]; + + rk[0] = ss[0] = word_in(key, 0); + rk[1] = ss[1] = word_in(key, 1); + rk[2] = ss[2] = word_in(key, 2); + rk[3] = ss[3] = word_in(key, 3); + rk[4] = ss[4] = word_in(key, 4); + rk[5] = ss[5] = word_in(key, 5); + +#ifdef ENC_KS_UNROLL + ke6(rk, 0); ke6(rk, 1); + ke6(rk, 2); ke6(rk, 3); + ke6(rk, 4); ke6(rk, 5); + ke6(rk, 6); +#else + { + uint32_t i; + for (i = 0; i < 7; ++i) + ke6(rk, i); + } +#endif /* ENC_KS_UNROLL */ + kef6(rk, 7); +} + + + +#define kef8(k, i) \ +{ k[8 * (i) + 8] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[8 * (i) + 9] = ss[1] ^= ss[0]; \ + k[8 * (i) + 10] = ss[2] ^= ss[1]; \ + k[8 * (i) + 11] = ss[3] ^= ss[2]; \ +} + +#define ke8(k, i) \ +{ kef8(k, i); \ + k[8 * (i) + 12] = ss[4] ^= ls_box(ss[3], 0); \ + k[8 * (i) + 13] = ss[5] ^= ss[4]; \ + k[8 * (i) + 14] = ss[6] ^= ss[5]; \ + k[8 * (i) + 15] = ss[7] ^= ss[6]; \ +} + +static void +aes_encrypt_key256(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[8]; + + rk[0] = ss[0] = word_in(key, 0); + rk[1] = ss[1] = word_in(key, 1); + rk[2] = ss[2] = word_in(key, 2); + rk[3] = ss[3] = word_in(key, 3); + rk[4] = ss[4] = word_in(key, 4); + rk[5] = ss[5] = word_in(key, 5); + 
rk[6] = ss[6] = word_in(key, 6); + rk[7] = ss[7] = word_in(key, 7); + +#ifdef ENC_KS_UNROLL + ke8(rk, 0); ke8(rk, 1); + ke8(rk, 2); ke8(rk, 3); + ke8(rk, 4); ke8(rk, 5); +#else + { + uint32_t i; + for (i = 0; i < 6; ++i) + ke8(rk, i); + } +#endif /* ENC_KS_UNROLL */ + kef8(rk, 6); +} + + +/* + * Expand the cipher key into the encryption key schedule. + * + * Return the number of rounds for the given cipher key size. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4 * (Nr + 1). + * + * Parameters: + * rk AES key schedule 32-bit array to be initialized + * cipherKey User key + * keyBits AES key size (128, 192, or 256 bits) + */ +int +rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[], + int keyBits) +{ + switch (keyBits) { + case 128: + aes_encrypt_key128((unsigned char *)&cipherKey[0], rk); + return (10); + case 192: + aes_encrypt_key192((unsigned char *)&cipherKey[0], rk); + return (12); + case 256: + aes_encrypt_key256((unsigned char *)&cipherKey[0], rk); + return (14); + default: /* should never get here */ + break; + } + + return (0); +} + + +/* this is used to store the decryption round keys */ +/* in forward or reverse order */ + +#ifdef AES_REV_DKS +#define v(n, i) ((n) - (i) + 2 * ((i) & 3)) +#else +#define v(n, i) (i) +#endif + +#if DEC_ROUND == NO_TABLES +#define ff(x) (x) +#else +#define ff(x) inv_mcol(x) +#if defined(dec_imvars) +#define d_vars dec_imvars +#endif +#endif /* FUNCS_IN_C & DEC_KEYING_IN_C */ + + +#define k4e(k, i) \ +{ k[v(40, (4 * (i)) + 4)] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[v(40, (4 * (i)) + 5)] = ss[1] ^= ss[0]; \ + k[v(40, (4 * (i)) + 6)] = ss[2] ^= ss[1]; \ + k[v(40, (4 * (i)) + 7)] = ss[3] ^= ss[2]; \ +} + +#if 1 + +#define kdf4(k, i) \ +{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ + ss[1] = ss[1] ^ ss[3]; \ + ss[2] = ss[2] ^ ss[3]; \ + ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ + ss[i % 4] ^= ss[4]; \ + ss[4] ^= k[v(40, (4 * (i)))]; k[v(40, (4 * (i)) + 4)] = ff(ss[4]); \ + ss[4] ^= k[v(40, (4 * (i)) + 1)]; k[v(40, (4 * (i)) + 5)] = ff(ss[4]); \ + ss[4] ^= k[v(40, (4 * (i)) + 2)]; k[v(40, (4 * (i)) + 6)] = ff(ss[4]); \ + ss[4] ^= k[v(40, (4 * (i)) + 3)]; k[v(40, (4 * (i)) + 7)] = ff(ss[4]); \ +} + +#define kd4(k, i) \ +{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ + ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ + k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \ + k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \ + k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \ + k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \ +} + +#define kdl4(k, i) \ +{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ + ss[i % 4] ^= ss[4]; \ + k[v(40, (4 * (i)) + 4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ + k[v(40, (4 * (i)) + 5)] = ss[1] ^ ss[3]; \ + k[v(40, (4 * (i)) + 6)] = ss[0]; \ + k[v(40, (4 * (i)) + 7)] = ss[1]; \ +} + +#else + +#define kdf4(k, i) \ +{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[v(40, (4 * (i)) + 4)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ff(ss[3]); \ +} + +#define kd4(k, i) \ +{ ss[4] = ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + ss[0] ^= ss[4]; \ + ss[4] = ff(ss[4]); \ + k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \ + ss[1] ^= ss[0]; \ + k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \ + ss[2] ^= ss[1]; \ + k[v(40, (4 * (i)) 
+ 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \ + ss[3] ^= ss[2]; \ + k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \ +} + +#define kdl4(k, i) \ +{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[v(40, (4 * (i)) + 4)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ss[3]; \ +} + +#endif + +static void +aes_decrypt_key128(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[5]; +#if defined(d_vars) + d_vars; +#endif + rk[v(40, (0))] = ss[0] = word_in(key, 0); + rk[v(40, (1))] = ss[1] = word_in(key, 1); + rk[v(40, (2))] = ss[2] = word_in(key, 2); + rk[v(40, (3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + kdf4(rk, 0); kd4(rk, 1); + kd4(rk, 2); kd4(rk, 3); + kd4(rk, 4); kd4(rk, 5); + kd4(rk, 6); kd4(rk, 7); + kd4(rk, 8); kdl4(rk, 9); +#else + { + uint32_t i; + for (i = 0; i < 10; ++i) + k4e(rk, i); +#if !(DEC_ROUND == NO_TABLES) + for (i = MAX_AES_NB; i < 10 * MAX_AES_NB; ++i) + rk[i] = inv_mcol(rk[i]); +#endif + } +#endif /* DEC_KS_UNROLL */ +} + + + +#define k6ef(k, i) \ +{ k[v(48, (6 * (i)) + 6)] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[v(48, (6 * (i)) + 7)] = ss[1] ^= ss[0]; \ + k[v(48, (6 * (i)) + 8)] = ss[2] ^= ss[1]; \ + k[v(48, (6 * (i)) + 9)] = ss[3] ^= ss[2]; \ +} + +#define k6e(k, i) \ +{ k6ef(k, i); \ + k[v(48, (6 * (i)) + 10)] = ss[4] ^= ss[3]; \ + k[v(48, (6 * (i)) + 11)] = ss[5] ^= ss[4]; \ +} + +#define kdf6(k, i) \ +{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[v(48, (6 * (i)) + 6)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ff(ss[3]); \ + ss[4] ^= ss[3]; k[v(48, (6 * (i)) + 10)] = ff(ss[4]); \ + ss[5] ^= ss[4]; k[v(48, (6 * (i)) + 11)] = ff(ss[5]); \ +} + +#define kd6(k, i) \ +{ ss[6] = ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \ + k[v(48, (6 * (i)) + 6)] = ss[6] ^= k[v(48, (6 * (i)))]; \ + ss[1] ^= ss[0]; \ + k[v(48, (6 * (i)) + 7)] = ss[6] ^= k[v(48, (6 * (i)) + 1)]; \ + ss[2] ^= ss[1]; \ + k[v(48, (6 * (i)) + 8)] = ss[6] ^= k[v(48, (6 * (i)) + 2)]; \ + ss[3] ^= ss[2]; \ + k[v(48, (6 * (i)) + 9)] = ss[6] ^= k[v(48, (6 * (i)) + 3)]; \ + ss[4] ^= ss[3]; \ + k[v(48, (6 * (i)) + 10)] = ss[6] ^= k[v(48, (6 * (i)) + 4)]; \ + ss[5] ^= ss[4]; \ + k[v(48, (6 * (i)) + 11)] = ss[6] ^= k[v(48, (6 * (i)) + 5)]; \ +} + +#define kdl6(k, i) \ +{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[v(48, (6 * (i)) + 6)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ss[3]; \ +} + +static void +aes_decrypt_key192(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[7]; +#if defined(d_vars) + d_vars; +#endif + rk[v(48, (0))] = ss[0] = word_in(key, 0); + rk[v(48, (1))] = ss[1] = word_in(key, 1); + rk[v(48, (2))] = ss[2] = word_in(key, 2); + rk[v(48, (3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + ss[4] = word_in(key, 4); + rk[v(48, (4))] = ff(ss[4]); + ss[5] = word_in(key, 5); + rk[v(48, (5))] = ff(ss[5]); + kdf6(rk, 0); kd6(rk, 1); + kd6(rk, 2); kd6(rk, 3); + kd6(rk, 4); kd6(rk, 5); + kd6(rk, 6); kdl6(rk, 7); +#else + rk[v(48, (4))] = ss[4] = word_in(key, 4); + rk[v(48, (5))] = ss[5] = word_in(key, 5); + { + uint32_t i; + + for (i = 0; i < 7; ++i) + k6e(rk, i); + k6ef(rk, 7); +#if !(DEC_ROUND == NO_TABLES) + for (i = MAX_AES_NB; i < 12 * MAX_AES_NB; ++i) 
+ rk[i] = inv_mcol(rk[i]); +#endif + } +#endif +} + + + +#define k8ef(k, i) \ +{ k[v(56, (8 * (i)) + 8)] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[v(56, (8 * (i)) + 9)] = ss[1] ^= ss[0]; \ + k[v(56, (8 * (i)) + 10)] = ss[2] ^= ss[1]; \ + k[v(56, (8 * (i)) + 11)] = ss[3] ^= ss[2]; \ +} + +#define k8e(k, i) \ +{ k8ef(k, i); \ + k[v(56, (8 * (i)) + 12)] = ss[4] ^= ls_box(ss[3], 0); \ + k[v(56, (8 * (i)) + 13)] = ss[5] ^= ss[4]; \ + k[v(56, (8 * (i)) + 14)] = ss[6] ^= ss[5]; \ + k[v(56, (8 * (i)) + 15)] = ss[7] ^= ss[6]; \ +} + +#define kdf8(k, i) \ +{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[v(56, (8 * (i)) + 8)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ff(ss[3]); \ + ss[4] ^= ls_box(ss[3], 0); k[v(56, (8 * (i)) + 12)] = ff(ss[4]); \ + ss[5] ^= ss[4]; k[v(56, (8 * (i)) + 13)] = ff(ss[5]); \ + ss[6] ^= ss[5]; k[v(56, (8 * (i)) + 14)] = ff(ss[6]); \ + ss[7] ^= ss[6]; k[v(56, (8 * (i)) + 15)] = ff(ss[7]); \ +} + +#define kd8(k, i) \ +{ ss[8] = ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + ss[0] ^= ss[8]; \ + ss[8] = ff(ss[8]); \ + k[v(56, (8 * (i)) + 8)] = ss[8] ^= k[v(56, (8 * (i)))]; \ + ss[1] ^= ss[0]; \ + k[v(56, (8 * (i)) + 9)] = ss[8] ^= k[v(56, (8 * (i)) + 1)]; \ + ss[2] ^= ss[1]; \ + k[v(56, (8 * (i)) + 10)] = ss[8] ^= k[v(56, (8 * (i)) + 2)]; \ + ss[3] ^= ss[2]; \ + k[v(56, (8 * (i)) + 11)] = ss[8] ^= k[v(56, (8 * (i)) + 3)]; \ + ss[8] = ls_box(ss[3], 0); \ + ss[4] ^= ss[8]; \ + ss[8] = ff(ss[8]); \ + k[v(56, (8 * (i)) + 12)] = ss[8] ^= k[v(56, (8 * (i)) + 4)]; \ + ss[5] ^= ss[4]; \ + k[v(56, (8 * (i)) + 13)] = ss[8] ^= k[v(56, (8 * (i)) + 5)]; \ + ss[6] ^= ss[5]; \ + k[v(56, (8 * (i)) + 14)] = ss[8] ^= k[v(56, (8 * (i)) + 6)]; \ + ss[7] ^= ss[6]; \ + k[v(56, (8 * (i)) + 15)] = ss[8] ^= k[v(56, (8 * (i)) + 7)]; \ +} + +#define kdl8(k, i) \ +{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[v(56, (8 * (i)) + 8)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ss[3]; \ +} + +static void +aes_decrypt_key256(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[9]; +#if defined(d_vars) + d_vars; +#endif + rk[v(56, (0))] = ss[0] = word_in(key, 0); + rk[v(56, (1))] = ss[1] = word_in(key, 1); + rk[v(56, (2))] = ss[2] = word_in(key, 2); + rk[v(56, (3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + ss[4] = word_in(key, 4); + rk[v(56, (4))] = ff(ss[4]); + ss[5] = word_in(key, 5); + rk[v(56, (5))] = ff(ss[5]); + ss[6] = word_in(key, 6); + rk[v(56, (6))] = ff(ss[6]); + ss[7] = word_in(key, 7); + rk[v(56, (7))] = ff(ss[7]); + kdf8(rk, 0); kd8(rk, 1); + kd8(rk, 2); kd8(rk, 3); + kd8(rk, 4); kd8(rk, 5); + kdl8(rk, 6); +#else + rk[v(56, (4))] = ss[4] = word_in(key, 4); + rk[v(56, (5))] = ss[5] = word_in(key, 5); + rk[v(56, (6))] = ss[6] = word_in(key, 6); + rk[v(56, (7))] = ss[7] = word_in(key, 7); + { + uint32_t i; + + for (i = 0; i < 6; ++i) + k8e(rk, i); + k8ef(rk, 6); +#if !(DEC_ROUND == NO_TABLES) + for (i = MAX_AES_NB; i < 14 * MAX_AES_NB; ++i) + rk[i] = inv_mcol(rk[i]); +#endif + } +#endif /* DEC_KS_UNROLL */ +} + + +/* + * Expand the cipher key into the decryption key schedule. + * + * Return the number of rounds for the given cipher key size. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4 * (Nr + 1). 
+ * + * Parameters: + * rk AES key schedule 32-bit array to be initialized + * cipherKey User key + * keyBits AES key size (128, 192, or 256 bits) + */ +int +rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[], + int keyBits) +{ + switch (keyBits) { + case 128: + aes_decrypt_key128((unsigned char *)&cipherKey[0], rk); + return (10); + case 192: + aes_decrypt_key192((unsigned char *)&cipherKey[0], rk); + return (12); + case 256: + aes_decrypt_key256((unsigned char *)&cipherKey[0], rk); + return (14); + default: /* should never get here */ + break; + } + + return (0); +} diff --git a/module/icp/asm-x86_64/aes/aesopt.h b/module/icp/asm-x86_64/aes/aesopt.h new file mode 100644 index 000000000..6aa61db82 --- /dev/null +++ b/module/icp/asm-x86_64/aes/aesopt.h @@ -0,0 +1,770 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue Date: 20/12/2007 + * + * This file contains the compilation options for AES (Rijndael) and code + * that is common across encryption, key scheduling and table generation. + * + * OPERATION + * + * These source code files implement the AES algorithm Rijndael designed by + * Joan Daemen and Vincent Rijmen. This version is designed for the standard + * block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 + * and 32 bytes). + * + * This version is designed for flexibility and speed using operations on + * 32-bit words rather than operations on bytes. It can be compiled with + * either big or little endian internal byte order but is faster when the + * native byte order for the processor is used. + * + * THE CIPHER INTERFACE + * + * The cipher interface is implemented as an array of bytes in which lower + * AES bit sequence indexes map to higher numeric significance within bytes. + */ + +/* + * OpenSolaris changes + * 1. Added __cplusplus and _AESTAB_H header guards + * 2. Added header files sys/types.h and aes_impl.h + * 3. Added defines for AES_ENCRYPT, AES_DECRYPT, AES_REV_DKS, and ASM_AMD64_C + * 4. Moved defines for IS_BIG_ENDIAN, IS_LITTLE_ENDIAN, PLATFORM_BYTE_ORDER + * from brg_endian.h + * 5. Undefined VIA_ACE_POSSIBLE and ASSUME_VIA_ACE_PRESENT + * 6. Changed uint_8t and uint_32t to uint8_t and uint32_t + * 7. Defined aes_sw32 as htonl() for byte swapping + * 8. 
Cstyled and hdrchk code + * + */ + +#ifndef _AESOPT_H +#define _AESOPT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> +#include <aes/aes_impl.h> + +/* SUPPORT FEATURES */ +#define AES_ENCRYPT /* if support for encryption is needed */ +#define AES_DECRYPT /* if support for decryption is needed */ + +/* PLATFORM-SPECIFIC FEATURES */ +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#define AES_REV_DKS /* define to reverse decryption key schedule */ + + +/* + * CONFIGURATION - THE USE OF DEFINES + * Later in this section there are a number of defines that control the + * operation of the code. In each section, the purpose of each define is + * explained so that the relevant form can be included or excluded by + * setting either 1's or 0's respectively on the branches of the related + * #if clauses. The following local defines should not be changed. + */ + +#define ENCRYPTION_IN_C 1 +#define DECRYPTION_IN_C 2 +#define ENC_KEYING_IN_C 4 +#define DEC_KEYING_IN_C 8 + +#define NO_TABLES 0 +#define ONE_TABLE 1 +#define FOUR_TABLES 4 +#define NONE 0 +#define PARTIAL 1 +#define FULL 2 + +/* --- START OF USER CONFIGURED OPTIONS --- */ + +/* + * 1. BYTE ORDER WITHIN 32 BIT WORDS + * + * The fundamental data processing units in Rijndael are 8-bit bytes. The + * input, output and key input are all enumerated arrays of bytes in which + * bytes are numbered starting at zero and increasing to one less than the + * number of bytes in the array in question. This enumeration is only used + * for naming bytes and does not imply any adjacency or order relationship + * from one byte to another. When these inputs and outputs are considered + * as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to + * byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. + * In this implementation bits are numbered from 0 to 7 starting at the + * numerically least significant end of each byte. Bit n represents 2^n. + * + * However, Rijndael can be implemented more efficiently using 32-bit + * words by packing bytes into words so that bytes 4*n to 4*n+3 are placed + * into word[n]. While in principle these bytes can be assembled into words + * in any positions, this implementation only supports the two formats in + * which bytes in adjacent positions within words also have adjacent byte + * numbers. This order is called big-endian if the lowest numbered bytes + * in words have the highest numeric significance and little-endian if the + * opposite applies. + * + * This code can work in either order irrespective of the order used by the + * machine on which it runs. Normally the internal byte order will be set + * to the order of the processor on which the code is to be run but this + * define can be used to reverse this in special situations + * + * WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set. + * This define will hence be redefined later (in section 4) if necessary + */ + +#if 1 +#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER +#elif 0 +#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 +#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN +#else +#error The algorithm byte order is not defined +#endif + +/* 2. 
VIA ACE SUPPORT */ + +#if defined(__GNUC__) && defined(__i386__) || \ + defined(_WIN32) && defined(_M_IX86) && \ + !(defined(_WIN64) || defined(_WIN32_WCE) || \ + defined(_MSC_VER) && (_MSC_VER <= 800)) +#define VIA_ACE_POSSIBLE +#endif + +/* + * Define this option if support for the VIA ACE is required. This uses + * inline assembler instructions and is only implemented for the Microsoft, + * Intel and GCC compilers. If VIA ACE is known to be present, then defining + * ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption + * code. If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if + * it is detected (both present and enabled) but the normal AES code will + * also be present. + * + * When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte + * aligned; other input/output buffers do not need to be 16 byte aligned + * but there are very large performance gains if this can be arranged. + * VIA ACE also requires the decryption key schedule to be in reverse + * order (which later checks below ensure). + */ + +/* VIA ACE is not used here for OpenSolaris: */ +#undef VIA_ACE_POSSIBLE +#undef ASSUME_VIA_ACE_PRESENT + +#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(USE_VIA_ACE_IF_PRESENT) +#define USE_VIA_ACE_IF_PRESENT +#endif + +#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(ASSUME_VIA_ACE_PRESENT) +#define ASSUME_VIA_ACE_PRESENT +#endif + + +/* + * 3. ASSEMBLER SUPPORT + * + * This define (which can be on the command line) enables the use of the + * assembler code routines for encryption, decryption and key scheduling + * as follows: + * + * ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for + * encryption and decryption and but with key scheduling in C + * ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for + * encryption, decryption and key scheduling + * ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for + * encryption and decryption and but with key scheduling in C + * ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for + * encryption and decryption and but with key scheduling in C + * + * Change one 'if 0' below to 'if 1' to select the version or define + * as a compilation option. + */ + +#if 0 && !defined(ASM_X86_V1C) +#define ASM_X86_V1C +#elif 0 && !defined(ASM_X86_V2) +#define ASM_X86_V2 +#elif 0 && !defined(ASM_X86_V2C) +#define ASM_X86_V2C +#elif 1 && !defined(ASM_AMD64_C) +#define ASM_AMD64_C +#endif + +#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2) || defined(ASM_X86_V2C)) && \ + !defined(_M_IX86) || defined(ASM_AMD64_C) && !defined(_M_X64) && \ + !defined(__amd64) +#error Assembler code is only available for x86 and AMD64 systems +#endif + +/* + * 4. FAST INPUT/OUTPUT OPERATIONS. + * + * On some machines it is possible to improve speed by transferring the + * bytes in the input and output arrays to and from the internal 32-bit + * variables by addressing these arrays as if they are arrays of 32-bit + * words. On some machines this will always be possible but there may + * be a large performance penalty if the byte arrays are not aligned on + * the normal word boundaries. On other machines this technique will + * lead to memory access errors when such 32-bit word accesses are not + * properly aligned. The option SAFE_IO avoids such problems but will + * often be slower on those machines that support misaligned access + * (especially so if care is taken to align the input and output byte + * arrays on 32-bit word boundaries). 
If SAFE_IO is not defined it is + * assumed that access to byte arrays as if they are arrays of 32-bit + * words will not cause problems when such accesses are misaligned. + */ +#if 1 && !defined(_MSC_VER) +#define SAFE_IO +#endif + +/* + * 5. LOOP UNROLLING + * + * The code for encryption and decryption cycles through a number of rounds + * that can be implemented either in a loop or by expanding the code into a + * long sequence of instructions, the latter producing a larger program but + * one that will often be much faster. The latter is called loop unrolling. + * There are also potential speed advantages in expanding two iterations in + * a loop with half the number of iterations, which is called partial loop + * unrolling. The following options allow partial or full loop unrolling + * to be set independently for encryption and decryption + */ +#if 1 +#define ENC_UNROLL FULL +#elif 0 +#define ENC_UNROLL PARTIAL +#else +#define ENC_UNROLL NONE +#endif + +#if 1 +#define DEC_UNROLL FULL +#elif 0 +#define DEC_UNROLL PARTIAL +#else +#define DEC_UNROLL NONE +#endif + +#if 1 +#define ENC_KS_UNROLL +#endif + +#if 1 +#define DEC_KS_UNROLL +#endif + +/* + * 6. FAST FINITE FIELD OPERATIONS + * + * If this section is included, tables are used to provide faster finite + * field arithmetic. This has no effect if FIXED_TABLES is defined. + */ +#if 1 +#define FF_TABLES +#endif + +/* + * 7. INTERNAL STATE VARIABLE FORMAT + * + * The internal state of Rijndael is stored in a number of local 32-bit + * word variables which can be defined either as an array or as individual + * names variables. Include this section if you want to store these local + * variables in arrays. Otherwise individual local variables will be used. + */ +#if 1 +#define ARRAYS +#endif + +/* + * 8. FIXED OR DYNAMIC TABLES + * + * When this section is included the tables used by the code are compiled + * statically into the binary file. Otherwise the subroutine aes_init() + * must be called to compute them before the code is first used. + */ +#if 1 && !(defined(_MSC_VER) && (_MSC_VER <= 800)) +#define FIXED_TABLES +#endif + +/* + * 9. MASKING OR CASTING FROM LONGER VALUES TO BYTES + * + * In some systems it is better to mask longer values to extract bytes + * rather than using a cast. This option allows this choice. + */ +#if 0 +#define to_byte(x) ((uint8_t)(x)) +#else +#define to_byte(x) ((x) & 0xff) +#endif + +/* + * 10. TABLE ALIGNMENT + * + * On some systems speed will be improved by aligning the AES large lookup + * tables on particular boundaries. This define should be set to a power of + * two giving the desired alignment. It can be left undefined if alignment + * is not needed. This option is specific to the Micrsoft VC++ compiler - + * it seems to sometimes cause trouble for the VC++ version 6 compiler. + */ + +#if 1 && defined(_MSC_VER) && (_MSC_VER >= 1300) +#define TABLE_ALIGN 32 +#endif + +/* + * 11. REDUCE CODE AND TABLE SIZE + * + * This replaces some expanded macros with function calls if AES_ASM_V2 or + * AES_ASM_V2C are defined + */ + +#if 1 && (defined(ASM_X86_V2) || defined(ASM_X86_V2C)) +#define REDUCE_CODE_SIZE +#endif + +/* + * 12. TABLE OPTIONS + * + * This cipher proceeds by repeating in a number of cycles known as rounds + * which are implemented by a round function which is optionally be speeded + * up using tables. The basic tables are 256 32-bit words, with either + * one or four tables being required for each round function depending on + * how much speed is required. 
Encryption and decryption round functions + * are different and the last encryption and decryption round functions are + * different again making four different round functions in all. + * + * This means that: + * 1. Normal encryption and decryption rounds can each use either 0, 1 + * or 4 tables and table spaces of 0, 1024 or 4096 bytes each. + * 2. The last encryption and decryption rounds can also use either 0, 1 + * or 4 tables and table spaces of 0, 1024 or 4096 bytes each. + * + * Include or exclude the appropriate definitions below to set the number + * of tables used by this implementation. + */ + +#if 1 /* set tables for the normal encryption round */ +#define ENC_ROUND FOUR_TABLES +#elif 0 +#define ENC_ROUND ONE_TABLE +#else +#define ENC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the last encryption round */ +#define LAST_ENC_ROUND FOUR_TABLES +#elif 0 +#define LAST_ENC_ROUND ONE_TABLE +#else +#define LAST_ENC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the normal decryption round */ +#define DEC_ROUND FOUR_TABLES +#elif 0 +#define DEC_ROUND ONE_TABLE +#else +#define DEC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the last decryption round */ +#define LAST_DEC_ROUND FOUR_TABLES +#elif 0 +#define LAST_DEC_ROUND ONE_TABLE +#else +#define LAST_DEC_ROUND NO_TABLES +#endif + +/* + * The decryption key schedule can be speeded up with tables in the same + * way that the round functions can. Include or exclude the following + * defines to set this requirement. + */ +#if 1 +#define KEY_SCHED FOUR_TABLES +#elif 0 +#define KEY_SCHED ONE_TABLE +#else +#define KEY_SCHED NO_TABLES +#endif + +/* ---- END OF USER CONFIGURED OPTIONS ---- */ + +/* VIA ACE support is only available for VC++ and GCC */ + +#if !defined(_MSC_VER) && !defined(__GNUC__) +#if defined(ASSUME_VIA_ACE_PRESENT) +#undef ASSUME_VIA_ACE_PRESENT +#endif +#if defined(USE_VIA_ACE_IF_PRESENT) +#undef USE_VIA_ACE_IF_PRESENT +#endif +#endif + +#if defined(ASSUME_VIA_ACE_PRESENT) && !defined(USE_VIA_ACE_IF_PRESENT) +#define USE_VIA_ACE_IF_PRESENT +#endif + +#if defined(USE_VIA_ACE_IF_PRESENT) && !defined(AES_REV_DKS) +#define AES_REV_DKS +#endif + +/* Assembler support requires the use of platform byte order */ + +#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2C) || defined(ASM_AMD64_C)) && \ + (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER) +#undef ALGORITHM_BYTE_ORDER +#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER +#endif + +/* + * In this implementation the columns of the state array are each held in + * 32-bit words. The state array can be held in various ways: in an array + * of words, in a number of individual word variables or in a number of + * processor registers. The following define maps a variable name x and + * a column number c to the way the state array variable is to be held. + * The first define below maps the state into an array x[c] whereas the + * second form maps the state into a number of individual variables x0, + * x1, etc. Another form could map individual state columns to machine + * register names. + */ + +#if defined(ARRAYS) +#define s(x, c) x[c] +#else +#define s(x, c) x##c +#endif + +/* + * This implementation provides subroutines for encryption, decryption + * and for setting the three key lengths (separately) for encryption + * and decryption. 
Since not all functions are needed, masks are set + * up here to determine which will be implemented in C + */ + +#if !defined(AES_ENCRYPT) +#define EFUNCS_IN_C 0 +#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \ + defined(ASM_X86_V2C) || defined(ASM_AMD64_C) +#define EFUNCS_IN_C ENC_KEYING_IN_C +#elif !defined(ASM_X86_V2) +#define EFUNCS_IN_C (ENCRYPTION_IN_C | ENC_KEYING_IN_C) +#else +#define EFUNCS_IN_C 0 +#endif + +#if !defined(AES_DECRYPT) +#define DFUNCS_IN_C 0 +#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \ + defined(ASM_X86_V2C) || defined(ASM_AMD64_C) +#define DFUNCS_IN_C DEC_KEYING_IN_C +#elif !defined(ASM_X86_V2) +#define DFUNCS_IN_C (DECRYPTION_IN_C | DEC_KEYING_IN_C) +#else +#define DFUNCS_IN_C 0 +#endif + +#define FUNCS_IN_C (EFUNCS_IN_C | DFUNCS_IN_C) + +/* END OF CONFIGURATION OPTIONS */ + +/* Disable or report errors on some combinations of options */ + +#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES +#undef LAST_ENC_ROUND +#define LAST_ENC_ROUND NO_TABLES +#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES +#undef LAST_ENC_ROUND +#define LAST_ENC_ROUND ONE_TABLE +#endif + +#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE +#undef ENC_UNROLL +#define ENC_UNROLL NONE +#endif + +#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES +#undef LAST_DEC_ROUND +#define LAST_DEC_ROUND NO_TABLES +#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES +#undef LAST_DEC_ROUND +#define LAST_DEC_ROUND ONE_TABLE +#endif + +#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE +#undef DEC_UNROLL +#define DEC_UNROLL NONE +#endif + +#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define aes_sw32 htonl +#elif defined(bswap32) +#define aes_sw32 bswap32 +#elif defined(bswap_32) +#define aes_sw32 bswap_32 +#else +#define brot(x, n) (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n)))) +#define aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00)) +#endif + + +/* + * upr(x, n): rotates bytes within words by n positions, moving bytes to + * higher index positions with wrap around into low positions + * ups(x, n): moves bytes by n positions to higher index positions in + * words but without wrap around + * bval(x, n): extracts a byte from a word + * + * WARNING: The definitions given here are intended only for use with + * unsigned variables and with shift counts that are compile + * time constants + */ + +#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \ + ((uint32_t)(x) >> (32 - 8 * (n)))) +#define ups(x, n) ((uint32_t)(x) << (8 * (n))) +#define bval(x, n) to_byte((x) >> (8 * (n))) +#define bytes2word(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \ + ((uint32_t)(b1) << 8) | (b0)) +#endif + +#if (ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN) +#define upr(x, n) (((uint32_t)(x) >> (8 * (n))) | \ + ((uint32_t)(x) << (32 - 8 * (n)))) +#define ups(x, n) ((uint32_t)(x) >> (8 * (n))) +#define bval(x, n) to_byte((x) >> (24 - 8 * (n))) +#define bytes2word(b0, b1, b2, b3) \ + (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | \ + ((uint32_t)(b2) << 8) | (b3)) +#endif + +#if defined(SAFE_IO) +#define word_in(x, c) bytes2word(((const uint8_t *)(x) + 4 * c)[0], \ + ((const uint8_t *)(x) + 4 * c)[1], \ + ((const uint8_t *)(x) + 4 * c)[2], \ + ((const uint8_t *)(x) + 4 * c)[3]) +#define word_out(x, c, v) { ((uint8_t *)(x) + 4 * c)[0] = bval(v, 0); \ + ((uint8_t *)(x) + 4 * c)[1] = bval(v, 1); \ + ((uint8_t *)(x) + 4 * c)[2] = bval(v, 2); \ + ((uint8_t *)(x) + 
4 * c)[3] = bval(v, 3); } +#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) +#define word_in(x, c) (*((uint32_t *)(x) + (c))) +#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = (v)) +#else +#define word_in(x, c) aes_sw32(*((uint32_t *)(x) + (c))) +#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = aes_sw32(v)) +#endif + +/* the finite field modular polynomial and elements */ + +#define WPOLY 0x011b +#define BPOLY 0x1b + +/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ + +#define m1 0x80808080 +#define m2 0x7f7f7f7f +#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) + +/* + * The following defines provide alternative definitions of gf_mulx that might + * give improved performance if a fast 32-bit multiply is not available. Note + * that a temporary variable u needs to be defined where gf_mulx is used. + * + * #define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ \ + * ((u >> 3) | (u >> 6)) + * #define m4 (0x01010101 * BPOLY) + * #define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) \ + * & m4) + */ + +/* Work out which tables are needed for the different options */ + +#if defined(ASM_X86_V1C) +#if defined(ENC_ROUND) +#undef ENC_ROUND +#endif +#define ENC_ROUND FOUR_TABLES +#if defined(LAST_ENC_ROUND) +#undef LAST_ENC_ROUND +#endif +#define LAST_ENC_ROUND FOUR_TABLES +#if defined(DEC_ROUND) +#undef DEC_ROUND +#endif +#define DEC_ROUND FOUR_TABLES +#if defined(LAST_DEC_ROUND) +#undef LAST_DEC_ROUND +#endif +#define LAST_DEC_ROUND FOUR_TABLES +#if defined(KEY_SCHED) +#undef KEY_SCHED +#define KEY_SCHED FOUR_TABLES +#endif +#endif + +#if (FUNCS_IN_C & ENCRYPTION_IN_C) || defined(ASM_X86_V1C) +#if ENC_ROUND == ONE_TABLE +#define FT1_SET +#elif ENC_ROUND == FOUR_TABLES +#define FT4_SET +#else +#define SBX_SET +#endif +#if LAST_ENC_ROUND == ONE_TABLE +#define FL1_SET +#elif LAST_ENC_ROUND == FOUR_TABLES +#define FL4_SET +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif + +#if (FUNCS_IN_C & DECRYPTION_IN_C) || defined(ASM_X86_V1C) +#if DEC_ROUND == ONE_TABLE +#define IT1_SET +#elif DEC_ROUND == FOUR_TABLES +#define IT4_SET +#else +#define ISB_SET +#endif +#if LAST_DEC_ROUND == ONE_TABLE +#define IL1_SET +#elif LAST_DEC_ROUND == FOUR_TABLES +#define IL4_SET +#elif !defined(ISB_SET) +#define ISB_SET +#endif +#endif + + +#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \ + defined(ASM_X86_V2C))) +#if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C)) +#if KEY_SCHED == ONE_TABLE +#if !defined(FL1_SET) && !defined(FL4_SET) +#define LS1_SET +#endif +#elif KEY_SCHED == FOUR_TABLES +#if !defined(FL4_SET) +#define LS4_SET +#endif +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif +#if (FUNCS_IN_C & DEC_KEYING_IN_C) +#if KEY_SCHED == ONE_TABLE +#define IM1_SET +#elif KEY_SCHED == FOUR_TABLES +#define IM4_SET +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif +#endif + +/* generic definitions of Rijndael macros that use tables */ + +#define no_table(x, box, vf, rf, c) bytes2word(\ + box[bval(vf(x, 0, c), rf(0, c))], \ + box[bval(vf(x, 1, c), rf(1, c))], \ + box[bval(vf(x, 2, c), rf(2, c))], \ + box[bval(vf(x, 3, c), rf(3, c))]) + +#define one_table(x, op, tab, vf, rf, c) \ + (tab[bval(vf(x, 0, c), rf(0, c))] \ + ^ op(tab[bval(vf(x, 1, c), rf(1, c))], 1) \ + ^ op(tab[bval(vf(x, 2, c), rf(2, c))], 2) \ + ^ op(tab[bval(vf(x, 3, c), rf(3, c))], 3)) + +#define four_tables(x, tab, vf, rf, c) \ + (tab[0][bval(vf(x, 0, c), rf(0, c))] \ + ^ tab[1][bval(vf(x, 1, c), rf(1, c))] \ + ^ tab[2][bval(vf(x, 
2, c), rf(2, c))] \ + ^ tab[3][bval(vf(x, 3, c), rf(3, c))]) + +#define vf1(x, r, c) (x) +#define rf1(r, c) (r) +#define rf2(r, c) ((8+r-c)&3) + +/* + * Perform forward and inverse column mix operation on four bytes in long word + * x in parallel. NOTE: x must be a simple variable, NOT an expression in + * these macros. + */ + +#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \ + defined(ASM_X86_V2C))) + +#if defined(FM4_SET) /* not currently used */ +#define fwd_mcol(x) four_tables(x, t_use(f, m), vf1, rf1, 0) +#elif defined(FM1_SET) /* not currently used */ +#define fwd_mcol(x) one_table(x, upr, t_use(f, m), vf1, rf1, 0) +#else +#define dec_fmvars uint32_t g2 +#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \ + upr((x), 2) ^ upr((x), 1)) +#endif + +#if defined(IM4_SET) +#define inv_mcol(x) four_tables(x, t_use(i, m), vf1, rf1, 0) +#elif defined(IM1_SET) +#define inv_mcol(x) one_table(x, upr, t_use(i, m), vf1, rf1, 0) +#else +#define dec_imvars uint32_t g2, g4, g9 +#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = \ + (x) ^ gf_mulx(g4), g4 ^= g9, \ + (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ \ + upr(g4, 2) ^ upr(g9, 1)) +#endif + +#if defined(FL4_SET) +#define ls_box(x, c) four_tables(x, t_use(f, l), vf1, rf2, c) +#elif defined(LS4_SET) +#define ls_box(x, c) four_tables(x, t_use(l, s), vf1, rf2, c) +#elif defined(FL1_SET) +#define ls_box(x, c) one_table(x, upr, t_use(f, l), vf1, rf2, c) +#elif defined(LS1_SET) +#define ls_box(x, c) one_table(x, upr, t_use(l, s), vf1, rf2, c) +#else +#define ls_box(x, c) no_table(x, t_use(s, box), vf1, rf2, c) +#endif + +#endif + +#if defined(ASM_X86_V1C) && defined(AES_DECRYPT) && !defined(ISB_SET) +#define ISB_SET +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _AESOPT_H */ diff --git a/module/icp/asm-x86_64/aes/aestab.h b/module/icp/asm-x86_64/aes/aestab.h new file mode 100644 index 000000000..33cdb6c6f --- /dev/null +++ b/module/icp/asm-x86_64/aes/aestab.h @@ -0,0 +1,165 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue Date: 20/12/2007 + * + * This file contains the code for declaring the tables needed to implement + * AES. The file aesopt.h is assumed to be included before this header file. + * If there are no global variables, the definitions here can be used to put + * the AES tables in a structure so that a pointer can then be added to the + * AES context to pass them to the AES routines that need them. If this + * facility is used, the calling program has to ensure that this pointer is + * managed appropriately. 
In particular, the value of the t_dec(in, it) item + * in the table structure must be set to zero in order to ensure that the + * tables are initialised. In practice the three code sequences in aeskey.c + * that control the calls to aes_init() and the aes_init() routine itself will + * have to be changed for a specific implementation. If global variables are + * available it will generally be preferable to use them with the precomputed + * FIXED_TABLES option that uses static global tables. + * + * The following defines can be used to control the way the tables + * are defined, initialised and used in embedded environments that + * require special features for these purposes + * + * the 't_dec' construction is used to declare fixed table arrays + * the 't_set' construction is used to set fixed table values + * the 't_use' construction is used to access fixed table values + * + * 256 byte tables: + * + * t_xxx(s, box) => forward S box + * t_xxx(i, box) => inverse S box + * + * 256 32-bit word OR 4 x 256 32-bit word tables: + * + * t_xxx(f, n) => forward normal round + * t_xxx(f, l) => forward last round + * t_xxx(i, n) => inverse normal round + * t_xxx(i, l) => inverse last round + * t_xxx(l, s) => key schedule table + * t_xxx(i, m) => key schedule table + * + * Other variables and tables: + * + * t_xxx(r, c) => the rcon table + */ + +/* + * OpenSolaris OS modifications + * + * 1. Added __cplusplus and _AESTAB_H header guards + * 2. Added header file sys/types.h + * 3. Remove code defined for _MSC_VER + * 4. Changed all variables to "static const" + * 5. Changed uint_8t and uint_32t to uint8_t and uint32_t + * 6. Cstyled and hdrchk code + */ + +#ifndef _AESTAB_H +#define _AESTAB_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> + +#define t_dec(m, n) t_##m##n +#define t_set(m, n) t_##m##n +#define t_use(m, n) t_##m##n + +#if defined(DO_TABLES) && defined(FIXED_TABLES) +#define d_1(t, n, b, e) static const t n[256] = b(e) +#define d_4(t, n, b, e, f, g, h) static const t n[4][256] = \ + {b(e), b(f), b(g), b(h)} +static const uint32_t t_dec(r, c)[RC_LENGTH] = rc_data(w0); +#else +#define d_1(t, n, b, e) static const t n[256] +#define d_4(t, n, b, e, f, g, h) static const t n[4][256] +static const uint32_t t_dec(r, c)[RC_LENGTH]; +#endif + +#if defined(SBX_SET) + d_1(uint8_t, t_dec(s, box), sb_data, h0); +#endif +#if defined(ISB_SET) + d_1(uint8_t, t_dec(i, box), isb_data, h0); +#endif + +#if defined(FT1_SET) + d_1(uint32_t, t_dec(f, n), sb_data, u0); +#endif +#if defined(FT4_SET) + d_4(uint32_t, t_dec(f, n), sb_data, u0, u1, u2, u3); +#endif + +#if defined(FL1_SET) + d_1(uint32_t, t_dec(f, l), sb_data, w0); +#endif +#if defined(FL4_SET) + d_4(uint32_t, t_dec(f, l), sb_data, w0, w1, w2, w3); +#endif + +#if defined(IT1_SET) + d_1(uint32_t, t_dec(i, n), isb_data, v0); +#endif +#if defined(IT4_SET) + d_4(uint32_t, t_dec(i, n), isb_data, v0, v1, v2, v3); +#endif + +#if defined(IL1_SET) + d_1(uint32_t, t_dec(i, l), isb_data, w0); +#endif +#if defined(IL4_SET) + d_4(uint32_t, t_dec(i, l), isb_data, w0, w1, w2, w3); +#endif + +#if defined(LS1_SET) +#if defined(FL1_SET) +#undef LS1_SET +#else + d_1(uint32_t, t_dec(l, s), sb_data, w0); +#endif +#endif + +#if defined(LS4_SET) +#if defined(FL4_SET) +#undef LS4_SET +#else + d_4(uint32_t, t_dec(l, s), sb_data, w0, w1, w2, w3); +#endif +#endif + +#if defined(IM1_SET) + d_1(uint32_t, t_dec(i, m), mm_data, v0); +#endif +#if defined(IM4_SET) + d_4(uint32_t, t_dec(i, m), mm_data, v0, v1, v2, v3); +#endif + +#ifdef __cplusplus +} +#endif + 
+#endif /* _AESTAB_H */ diff --git a/module/icp/asm-x86_64/aes/aestab2.h b/module/icp/asm-x86_64/aes/aestab2.h new file mode 100644 index 000000000..eb13f72b1 --- /dev/null +++ b/module/icp/asm-x86_64/aes/aestab2.h @@ -0,0 +1,594 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AESTAB2_H +#define _AESTAB2_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * To create this file for OpenSolaris: + * 1. Compile and run tablegen.c, from aes-src-04-03-08.zip, + * after defining ASM_AMD64_C + * 2. mv aestab2.c aestab2.h + * 3. Add __cplusplus and _AESTAB2_H header guards + * 3. Add #include <aes_impl.h> + * 4. Change "uint_32t" to "uint32_t" + * 5. Change all variables to "static const" + * 6. Cstyle and hdrchk this file + */ + +#include <aes/aes_impl.h> + +static const uint32_t t_rc[RC_LENGTH] = +{ + 0x00000001, 0x00000002, 0x00000004, 0x00000008, + 0x00000010, 0x00000020, 0x00000040, 0x00000080, + 0x0000001b, 0x00000036 +}; + +static const uint32_t t_ls[4][256] = +{ + { + 0x00000063, 0x0000007c, 0x00000077, 0x0000007b, + 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5, + 0x00000030, 0x00000001, 0x00000067, 0x0000002b, + 0x000000fe, 0x000000d7, 0x000000ab, 0x00000076, + 0x000000ca, 0x00000082, 0x000000c9, 0x0000007d, + 0x000000fa, 0x00000059, 0x00000047, 0x000000f0, + 0x000000ad, 0x000000d4, 0x000000a2, 0x000000af, + 0x0000009c, 0x000000a4, 0x00000072, 0x000000c0, + 0x000000b7, 0x000000fd, 0x00000093, 0x00000026, + 0x00000036, 0x0000003f, 0x000000f7, 0x000000cc, + 0x00000034, 0x000000a5, 0x000000e5, 0x000000f1, + 0x00000071, 0x000000d8, 0x00000031, 0x00000015, + 0x00000004, 0x000000c7, 0x00000023, 0x000000c3, + 0x00000018, 0x00000096, 0x00000005, 0x0000009a, + 0x00000007, 0x00000012, 0x00000080, 0x000000e2, + 0x000000eb, 0x00000027, 0x000000b2, 0x00000075, + 0x00000009, 0x00000083, 0x0000002c, 0x0000001a, + 0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0, + 0x00000052, 0x0000003b, 0x000000d6, 0x000000b3, + 0x00000029, 0x000000e3, 0x0000002f, 0x00000084, + 0x00000053, 0x000000d1, 0x00000000, 0x000000ed, + 0x00000020, 0x000000fc, 0x000000b1, 0x0000005b, + 0x0000006a, 0x000000cb, 0x000000be, 0x00000039, + 0x0000004a, 0x0000004c, 0x00000058, 0x000000cf, + 0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb, + 0x00000043, 0x0000004d, 0x00000033, 0x00000085, + 0x00000045, 0x000000f9, 0x00000002, 0x0000007f, + 0x00000050, 0x0000003c, 0x0000009f, 0x000000a8, + 0x00000051, 0x000000a3, 0x00000040, 0x0000008f, + 0x00000092, 0x0000009d, 0x00000038, 0x000000f5, + 0x000000bc, 0x000000b6, 0x000000da, 0x00000021, + 0x00000010, 0x000000ff, 0x000000f3, 0x000000d2, + 0x000000cd, 0x0000000c, 0x00000013, 0x000000ec, + 
0x0000005f, 0x00000097, 0x00000044, 0x00000017, + 0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d, + 0x00000064, 0x0000005d, 0x00000019, 0x00000073, + 0x00000060, 0x00000081, 0x0000004f, 0x000000dc, + 0x00000022, 0x0000002a, 0x00000090, 0x00000088, + 0x00000046, 0x000000ee, 0x000000b8, 0x00000014, + 0x000000de, 0x0000005e, 0x0000000b, 0x000000db, + 0x000000e0, 0x00000032, 0x0000003a, 0x0000000a, + 0x00000049, 0x00000006, 0x00000024, 0x0000005c, + 0x000000c2, 0x000000d3, 0x000000ac, 0x00000062, + 0x00000091, 0x00000095, 0x000000e4, 0x00000079, + 0x000000e7, 0x000000c8, 0x00000037, 0x0000006d, + 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9, + 0x0000006c, 0x00000056, 0x000000f4, 0x000000ea, + 0x00000065, 0x0000007a, 0x000000ae, 0x00000008, + 0x000000ba, 0x00000078, 0x00000025, 0x0000002e, + 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6, + 0x000000e8, 0x000000dd, 0x00000074, 0x0000001f, + 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a, + 0x00000070, 0x0000003e, 0x000000b5, 0x00000066, + 0x00000048, 0x00000003, 0x000000f6, 0x0000000e, + 0x00000061, 0x00000035, 0x00000057, 0x000000b9, + 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e, + 0x000000e1, 0x000000f8, 0x00000098, 0x00000011, + 0x00000069, 0x000000d9, 0x0000008e, 0x00000094, + 0x0000009b, 0x0000001e, 0x00000087, 0x000000e9, + 0x000000ce, 0x00000055, 0x00000028, 0x000000df, + 0x0000008c, 0x000000a1, 0x00000089, 0x0000000d, + 0x000000bf, 0x000000e6, 0x00000042, 0x00000068, + 0x00000041, 0x00000099, 0x0000002d, 0x0000000f, + 0x000000b0, 0x00000054, 0x000000bb, 0x00000016 + }, + { + 0x00006300, 0x00007c00, 0x00007700, 0x00007b00, + 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500, + 0x00003000, 0x00000100, 0x00006700, 0x00002b00, + 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600, + 0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00, + 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000, + 0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00, + 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000, + 0x0000b700, 0x0000fd00, 0x00009300, 0x00002600, + 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00, + 0x00003400, 0x0000a500, 0x0000e500, 0x0000f100, + 0x00007100, 0x0000d800, 0x00003100, 0x00001500, + 0x00000400, 0x0000c700, 0x00002300, 0x0000c300, + 0x00001800, 0x00009600, 0x00000500, 0x00009a00, + 0x00000700, 0x00001200, 0x00008000, 0x0000e200, + 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500, + 0x00000900, 0x00008300, 0x00002c00, 0x00001a00, + 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000, + 0x00005200, 0x00003b00, 0x0000d600, 0x0000b300, + 0x00002900, 0x0000e300, 0x00002f00, 0x00008400, + 0x00005300, 0x0000d100, 0x00000000, 0x0000ed00, + 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00, + 0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900, + 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00, + 0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00, + 0x00004300, 0x00004d00, 0x00003300, 0x00008500, + 0x00004500, 0x0000f900, 0x00000200, 0x00007f00, + 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800, + 0x00005100, 0x0000a300, 0x00004000, 0x00008f00, + 0x00009200, 0x00009d00, 0x00003800, 0x0000f500, + 0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100, + 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200, + 0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00, + 0x00005f00, 0x00009700, 0x00004400, 0x00001700, + 0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00, + 0x00006400, 0x00005d00, 0x00001900, 0x00007300, + 0x00006000, 0x00008100, 0x00004f00, 0x0000dc00, + 0x00002200, 0x00002a00, 0x00009000, 0x00008800, + 0x00004600, 0x0000ee00, 0x0000b800, 0x00001400, + 0x0000de00, 0x00005e00, 0x00000b00, 
0x0000db00, + 0x0000e000, 0x00003200, 0x00003a00, 0x00000a00, + 0x00004900, 0x00000600, 0x00002400, 0x00005c00, + 0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200, + 0x00009100, 0x00009500, 0x0000e400, 0x00007900, + 0x0000e700, 0x0000c800, 0x00003700, 0x00006d00, + 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900, + 0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00, + 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800, + 0x0000ba00, 0x00007800, 0x00002500, 0x00002e00, + 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600, + 0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00, + 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00, + 0x00007000, 0x00003e00, 0x0000b500, 0x00006600, + 0x00004800, 0x00000300, 0x0000f600, 0x00000e00, + 0x00006100, 0x00003500, 0x00005700, 0x0000b900, + 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00, + 0x0000e100, 0x0000f800, 0x00009800, 0x00001100, + 0x00006900, 0x0000d900, 0x00008e00, 0x00009400, + 0x00009b00, 0x00001e00, 0x00008700, 0x0000e900, + 0x0000ce00, 0x00005500, 0x00002800, 0x0000df00, + 0x00008c00, 0x0000a100, 0x00008900, 0x00000d00, + 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800, + 0x00004100, 0x00009900, 0x00002d00, 0x00000f00, + 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600 + }, + { + 0x00630000, 0x007c0000, 0x00770000, 0x007b0000, + 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000, + 0x00300000, 0x00010000, 0x00670000, 0x002b0000, + 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000, + 0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000, + 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000, + 0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000, + 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000, + 0x00b70000, 0x00fd0000, 0x00930000, 0x00260000, + 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000, + 0x00340000, 0x00a50000, 0x00e50000, 0x00f10000, + 0x00710000, 0x00d80000, 0x00310000, 0x00150000, + 0x00040000, 0x00c70000, 0x00230000, 0x00c30000, + 0x00180000, 0x00960000, 0x00050000, 0x009a0000, + 0x00070000, 0x00120000, 0x00800000, 0x00e20000, + 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000, + 0x00090000, 0x00830000, 0x002c0000, 0x001a0000, + 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000, + 0x00520000, 0x003b0000, 0x00d60000, 0x00b30000, + 0x00290000, 0x00e30000, 0x002f0000, 0x00840000, + 0x00530000, 0x00d10000, 0x00000000, 0x00ed0000, + 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000, + 0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000, + 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000, + 0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000, + 0x00430000, 0x004d0000, 0x00330000, 0x00850000, + 0x00450000, 0x00f90000, 0x00020000, 0x007f0000, + 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000, + 0x00510000, 0x00a30000, 0x00400000, 0x008f0000, + 0x00920000, 0x009d0000, 0x00380000, 0x00f50000, + 0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000, + 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000, + 0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000, + 0x005f0000, 0x00970000, 0x00440000, 0x00170000, + 0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000, + 0x00640000, 0x005d0000, 0x00190000, 0x00730000, + 0x00600000, 0x00810000, 0x004f0000, 0x00dc0000, + 0x00220000, 0x002a0000, 0x00900000, 0x00880000, + 0x00460000, 0x00ee0000, 0x00b80000, 0x00140000, + 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000, + 0x00e00000, 0x00320000, 0x003a0000, 0x000a0000, + 0x00490000, 0x00060000, 0x00240000, 0x005c0000, + 0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000, + 0x00910000, 0x00950000, 0x00e40000, 0x00790000, + 0x00e70000, 0x00c80000, 0x00370000, 0x006d0000, + 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000, + 0x006c0000, 0x00560000, 
0x00f40000, 0x00ea0000, + 0x00650000, 0x007a0000, 0x00ae0000, 0x00080000, + 0x00ba0000, 0x00780000, 0x00250000, 0x002e0000, + 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000, + 0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000, + 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000, + 0x00700000, 0x003e0000, 0x00b50000, 0x00660000, + 0x00480000, 0x00030000, 0x00f60000, 0x000e0000, + 0x00610000, 0x00350000, 0x00570000, 0x00b90000, + 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000, + 0x00e10000, 0x00f80000, 0x00980000, 0x00110000, + 0x00690000, 0x00d90000, 0x008e0000, 0x00940000, + 0x009b0000, 0x001e0000, 0x00870000, 0x00e90000, + 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000, + 0x008c0000, 0x00a10000, 0x00890000, 0x000d0000, + 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000, + 0x00410000, 0x00990000, 0x002d0000, 0x000f0000, + 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000 + }, + { + 0x63000000, 0x7c000000, 0x77000000, 0x7b000000, + 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000, + 0x30000000, 0x01000000, 0x67000000, 0x2b000000, + 0xfe000000, 0xd7000000, 0xab000000, 0x76000000, + 0xca000000, 0x82000000, 0xc9000000, 0x7d000000, + 0xfa000000, 0x59000000, 0x47000000, 0xf0000000, + 0xad000000, 0xd4000000, 0xa2000000, 0xaf000000, + 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000, + 0xb7000000, 0xfd000000, 0x93000000, 0x26000000, + 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000, + 0x34000000, 0xa5000000, 0xe5000000, 0xf1000000, + 0x71000000, 0xd8000000, 0x31000000, 0x15000000, + 0x04000000, 0xc7000000, 0x23000000, 0xc3000000, + 0x18000000, 0x96000000, 0x05000000, 0x9a000000, + 0x07000000, 0x12000000, 0x80000000, 0xe2000000, + 0xeb000000, 0x27000000, 0xb2000000, 0x75000000, + 0x09000000, 0x83000000, 0x2c000000, 0x1a000000, + 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000, + 0x52000000, 0x3b000000, 0xd6000000, 0xb3000000, + 0x29000000, 0xe3000000, 0x2f000000, 0x84000000, + 0x53000000, 0xd1000000, 0x00000000, 0xed000000, + 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000, + 0x6a000000, 0xcb000000, 0xbe000000, 0x39000000, + 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000, + 0xd0000000, 0xef000000, 0xaa000000, 0xfb000000, + 0x43000000, 0x4d000000, 0x33000000, 0x85000000, + 0x45000000, 0xf9000000, 0x02000000, 0x7f000000, + 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000, + 0x51000000, 0xa3000000, 0x40000000, 0x8f000000, + 0x92000000, 0x9d000000, 0x38000000, 0xf5000000, + 0xbc000000, 0xb6000000, 0xda000000, 0x21000000, + 0x10000000, 0xff000000, 0xf3000000, 0xd2000000, + 0xcd000000, 0x0c000000, 0x13000000, 0xec000000, + 0x5f000000, 0x97000000, 0x44000000, 0x17000000, + 0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000, + 0x64000000, 0x5d000000, 0x19000000, 0x73000000, + 0x60000000, 0x81000000, 0x4f000000, 0xdc000000, + 0x22000000, 0x2a000000, 0x90000000, 0x88000000, + 0x46000000, 0xee000000, 0xb8000000, 0x14000000, + 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000, + 0xe0000000, 0x32000000, 0x3a000000, 0x0a000000, + 0x49000000, 0x06000000, 0x24000000, 0x5c000000, + 0xc2000000, 0xd3000000, 0xac000000, 0x62000000, + 0x91000000, 0x95000000, 0xe4000000, 0x79000000, + 0xe7000000, 0xc8000000, 0x37000000, 0x6d000000, + 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000, + 0x6c000000, 0x56000000, 0xf4000000, 0xea000000, + 0x65000000, 0x7a000000, 0xae000000, 0x08000000, + 0xba000000, 0x78000000, 0x25000000, 0x2e000000, + 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000, + 0xe8000000, 0xdd000000, 0x74000000, 0x1f000000, + 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000, + 0x70000000, 0x3e000000, 0xb5000000, 0x66000000, + 0x48000000, 
0x03000000, 0xf6000000, 0x0e000000, + 0x61000000, 0x35000000, 0x57000000, 0xb9000000, + 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000, + 0xe1000000, 0xf8000000, 0x98000000, 0x11000000, + 0x69000000, 0xd9000000, 0x8e000000, 0x94000000, + 0x9b000000, 0x1e000000, 0x87000000, 0xe9000000, + 0xce000000, 0x55000000, 0x28000000, 0xdf000000, + 0x8c000000, 0xa1000000, 0x89000000, 0x0d000000, + 0xbf000000, 0xe6000000, 0x42000000, 0x68000000, + 0x41000000, 0x99000000, 0x2d000000, 0x0f000000, + 0xb0000000, 0x54000000, 0xbb000000, 0x16000000 + } +}; + +static const uint32_t t_im[4][256] = +{ + { + 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12, + 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a, + 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362, + 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a, + 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2, + 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca, + 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382, + 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba, + 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9, + 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1, + 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9, + 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81, + 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029, + 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411, + 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859, + 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61, + 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf, + 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987, + 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf, + 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7, + 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f, + 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967, + 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f, + 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117, + 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664, + 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c, + 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14, + 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c, + 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684, + 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc, + 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4, + 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc, + 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753, + 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b, + 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23, + 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b, + 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3, + 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b, + 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3, + 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb, + 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88, + 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0, + 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8, + 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0, + 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68, + 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850, + 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418, + 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020, + 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe, + 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6, + 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e, + 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6, + 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e, + 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526, + 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e, + 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56, + 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25, + 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d, + 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255, + 0x0e50cd7f, 
0x055dc471, 0x184adf63, 0x1347d66d, + 0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5, + 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd, + 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5, + 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d + }, + { + 0x00000000, 0x0d090e0b, 0x1a121c16, 0x171b121d, + 0x3424382c, 0x392d3627, 0x2e36243a, 0x233f2a31, + 0x68487058, 0x65417e53, 0x725a6c4e, 0x7f536245, + 0x5c6c4874, 0x5165467f, 0x467e5462, 0x4b775a69, + 0xd090e0b0, 0xdd99eebb, 0xca82fca6, 0xc78bf2ad, + 0xe4b4d89c, 0xe9bdd697, 0xfea6c48a, 0xf3afca81, + 0xb8d890e8, 0xb5d19ee3, 0xa2ca8cfe, 0xafc382f5, + 0x8cfca8c4, 0x81f5a6cf, 0x96eeb4d2, 0x9be7bad9, + 0xbb3bdb7b, 0xb632d570, 0xa129c76d, 0xac20c966, + 0x8f1fe357, 0x8216ed5c, 0x950dff41, 0x9804f14a, + 0xd373ab23, 0xde7aa528, 0xc961b735, 0xc468b93e, + 0xe757930f, 0xea5e9d04, 0xfd458f19, 0xf04c8112, + 0x6bab3bcb, 0x66a235c0, 0x71b927dd, 0x7cb029d6, + 0x5f8f03e7, 0x52860dec, 0x459d1ff1, 0x489411fa, + 0x03e34b93, 0x0eea4598, 0x19f15785, 0x14f8598e, + 0x37c773bf, 0x3ace7db4, 0x2dd56fa9, 0x20dc61a2, + 0x6d76adf6, 0x607fa3fd, 0x7764b1e0, 0x7a6dbfeb, + 0x595295da, 0x545b9bd1, 0x434089cc, 0x4e4987c7, + 0x053eddae, 0x0837d3a5, 0x1f2cc1b8, 0x1225cfb3, + 0x311ae582, 0x3c13eb89, 0x2b08f994, 0x2601f79f, + 0xbde64d46, 0xb0ef434d, 0xa7f45150, 0xaafd5f5b, + 0x89c2756a, 0x84cb7b61, 0x93d0697c, 0x9ed96777, + 0xd5ae3d1e, 0xd8a73315, 0xcfbc2108, 0xc2b52f03, + 0xe18a0532, 0xec830b39, 0xfb981924, 0xf691172f, + 0xd64d768d, 0xdb447886, 0xcc5f6a9b, 0xc1566490, + 0xe2694ea1, 0xef6040aa, 0xf87b52b7, 0xf5725cbc, + 0xbe0506d5, 0xb30c08de, 0xa4171ac3, 0xa91e14c8, + 0x8a213ef9, 0x872830f2, 0x903322ef, 0x9d3a2ce4, + 0x06dd963d, 0x0bd49836, 0x1ccf8a2b, 0x11c68420, + 0x32f9ae11, 0x3ff0a01a, 0x28ebb207, 0x25e2bc0c, + 0x6e95e665, 0x639ce86e, 0x7487fa73, 0x798ef478, + 0x5ab1de49, 0x57b8d042, 0x40a3c25f, 0x4daacc54, + 0xdaec41f7, 0xd7e54ffc, 0xc0fe5de1, 0xcdf753ea, + 0xeec879db, 0xe3c177d0, 0xf4da65cd, 0xf9d36bc6, + 0xb2a431af, 0xbfad3fa4, 0xa8b62db9, 0xa5bf23b2, + 0x86800983, 0x8b890788, 0x9c921595, 0x919b1b9e, + 0x0a7ca147, 0x0775af4c, 0x106ebd51, 0x1d67b35a, + 0x3e58996b, 0x33519760, 0x244a857d, 0x29438b76, + 0x6234d11f, 0x6f3ddf14, 0x7826cd09, 0x752fc302, + 0x5610e933, 0x5b19e738, 0x4c02f525, 0x410bfb2e, + 0x61d79a8c, 0x6cde9487, 0x7bc5869a, 0x76cc8891, + 0x55f3a2a0, 0x58faacab, 0x4fe1beb6, 0x42e8b0bd, + 0x099fead4, 0x0496e4df, 0x138df6c2, 0x1e84f8c9, + 0x3dbbd2f8, 0x30b2dcf3, 0x27a9ceee, 0x2aa0c0e5, + 0xb1477a3c, 0xbc4e7437, 0xab55662a, 0xa65c6821, + 0x85634210, 0x886a4c1b, 0x9f715e06, 0x9278500d, + 0xd90f0a64, 0xd406046f, 0xc31d1672, 0xce141879, + 0xed2b3248, 0xe0223c43, 0xf7392e5e, 0xfa302055, + 0xb79aec01, 0xba93e20a, 0xad88f017, 0xa081fe1c, + 0x83bed42d, 0x8eb7da26, 0x99acc83b, 0x94a5c630, + 0xdfd29c59, 0xd2db9252, 0xc5c0804f, 0xc8c98e44, + 0xebf6a475, 0xe6ffaa7e, 0xf1e4b863, 0xfcedb668, + 0x670a0cb1, 0x6a0302ba, 0x7d1810a7, 0x70111eac, + 0x532e349d, 0x5e273a96, 0x493c288b, 0x44352680, + 0x0f427ce9, 0x024b72e2, 0x155060ff, 0x18596ef4, + 0x3b6644c5, 0x366f4ace, 0x217458d3, 0x2c7d56d8, + 0x0ca1377a, 0x01a83971, 0x16b32b6c, 0x1bba2567, + 0x38850f56, 0x358c015d, 0x22971340, 0x2f9e1d4b, + 0x64e94722, 0x69e04929, 0x7efb5b34, 0x73f2553f, + 0x50cd7f0e, 0x5dc47105, 0x4adf6318, 0x47d66d13, + 0xdc31d7ca, 0xd138d9c1, 0xc623cbdc, 0xcb2ac5d7, + 0xe815efe6, 0xe51ce1ed, 0xf207f3f0, 0xff0efdfb, + 0xb479a792, 0xb970a999, 0xae6bbb84, 0xa362b58f, + 0x805d9fbe, 0x8d5491b5, 0x9a4f83a8, 0x97468da3 + }, + { + 0x00000000, 0x090e0b0d, 0x121c161a, 0x1b121d17, + 0x24382c34, 0x2d362739, 0x36243a2e, 0x3f2a3123, + 
0x48705868, 0x417e5365, 0x5a6c4e72, 0x5362457f, + 0x6c48745c, 0x65467f51, 0x7e546246, 0x775a694b, + 0x90e0b0d0, 0x99eebbdd, 0x82fca6ca, 0x8bf2adc7, + 0xb4d89ce4, 0xbdd697e9, 0xa6c48afe, 0xafca81f3, + 0xd890e8b8, 0xd19ee3b5, 0xca8cfea2, 0xc382f5af, + 0xfca8c48c, 0xf5a6cf81, 0xeeb4d296, 0xe7bad99b, + 0x3bdb7bbb, 0x32d570b6, 0x29c76da1, 0x20c966ac, + 0x1fe3578f, 0x16ed5c82, 0x0dff4195, 0x04f14a98, + 0x73ab23d3, 0x7aa528de, 0x61b735c9, 0x68b93ec4, + 0x57930fe7, 0x5e9d04ea, 0x458f19fd, 0x4c8112f0, + 0xab3bcb6b, 0xa235c066, 0xb927dd71, 0xb029d67c, + 0x8f03e75f, 0x860dec52, 0x9d1ff145, 0x9411fa48, + 0xe34b9303, 0xea45980e, 0xf1578519, 0xf8598e14, + 0xc773bf37, 0xce7db43a, 0xd56fa92d, 0xdc61a220, + 0x76adf66d, 0x7fa3fd60, 0x64b1e077, 0x6dbfeb7a, + 0x5295da59, 0x5b9bd154, 0x4089cc43, 0x4987c74e, + 0x3eddae05, 0x37d3a508, 0x2cc1b81f, 0x25cfb312, + 0x1ae58231, 0x13eb893c, 0x08f9942b, 0x01f79f26, + 0xe64d46bd, 0xef434db0, 0xf45150a7, 0xfd5f5baa, + 0xc2756a89, 0xcb7b6184, 0xd0697c93, 0xd967779e, + 0xae3d1ed5, 0xa73315d8, 0xbc2108cf, 0xb52f03c2, + 0x8a0532e1, 0x830b39ec, 0x981924fb, 0x91172ff6, + 0x4d768dd6, 0x447886db, 0x5f6a9bcc, 0x566490c1, + 0x694ea1e2, 0x6040aaef, 0x7b52b7f8, 0x725cbcf5, + 0x0506d5be, 0x0c08deb3, 0x171ac3a4, 0x1e14c8a9, + 0x213ef98a, 0x2830f287, 0x3322ef90, 0x3a2ce49d, + 0xdd963d06, 0xd498360b, 0xcf8a2b1c, 0xc6842011, + 0xf9ae1132, 0xf0a01a3f, 0xebb20728, 0xe2bc0c25, + 0x95e6656e, 0x9ce86e63, 0x87fa7374, 0x8ef47879, + 0xb1de495a, 0xb8d04257, 0xa3c25f40, 0xaacc544d, + 0xec41f7da, 0xe54ffcd7, 0xfe5de1c0, 0xf753eacd, + 0xc879dbee, 0xc177d0e3, 0xda65cdf4, 0xd36bc6f9, + 0xa431afb2, 0xad3fa4bf, 0xb62db9a8, 0xbf23b2a5, + 0x80098386, 0x8907888b, 0x9215959c, 0x9b1b9e91, + 0x7ca1470a, 0x75af4c07, 0x6ebd5110, 0x67b35a1d, + 0x58996b3e, 0x51976033, 0x4a857d24, 0x438b7629, + 0x34d11f62, 0x3ddf146f, 0x26cd0978, 0x2fc30275, + 0x10e93356, 0x19e7385b, 0x02f5254c, 0x0bfb2e41, + 0xd79a8c61, 0xde94876c, 0xc5869a7b, 0xcc889176, + 0xf3a2a055, 0xfaacab58, 0xe1beb64f, 0xe8b0bd42, + 0x9fead409, 0x96e4df04, 0x8df6c213, 0x84f8c91e, + 0xbbd2f83d, 0xb2dcf330, 0xa9ceee27, 0xa0c0e52a, + 0x477a3cb1, 0x4e7437bc, 0x55662aab, 0x5c6821a6, + 0x63421085, 0x6a4c1b88, 0x715e069f, 0x78500d92, + 0x0f0a64d9, 0x06046fd4, 0x1d1672c3, 0x141879ce, + 0x2b3248ed, 0x223c43e0, 0x392e5ef7, 0x302055fa, + 0x9aec01b7, 0x93e20aba, 0x88f017ad, 0x81fe1ca0, + 0xbed42d83, 0xb7da268e, 0xacc83b99, 0xa5c63094, + 0xd29c59df, 0xdb9252d2, 0xc0804fc5, 0xc98e44c8, + 0xf6a475eb, 0xffaa7ee6, 0xe4b863f1, 0xedb668fc, + 0x0a0cb167, 0x0302ba6a, 0x1810a77d, 0x111eac70, + 0x2e349d53, 0x273a965e, 0x3c288b49, 0x35268044, + 0x427ce90f, 0x4b72e202, 0x5060ff15, 0x596ef418, + 0x6644c53b, 0x6f4ace36, 0x7458d321, 0x7d56d82c, + 0xa1377a0c, 0xa8397101, 0xb32b6c16, 0xba25671b, + 0x850f5638, 0x8c015d35, 0x97134022, 0x9e1d4b2f, + 0xe9472264, 0xe0492969, 0xfb5b347e, 0xf2553f73, + 0xcd7f0e50, 0xc471055d, 0xdf63184a, 0xd66d1347, + 0x31d7cadc, 0x38d9c1d1, 0x23cbdcc6, 0x2ac5d7cb, + 0x15efe6e8, 0x1ce1ede5, 0x07f3f0f2, 0x0efdfbff, + 0x79a792b4, 0x70a999b9, 0x6bbb84ae, 0x62b58fa3, + 0x5d9fbe80, 0x5491b58d, 0x4f83a89a, 0x468da397 + }, + { + 0x00000000, 0x0e0b0d09, 0x1c161a12, 0x121d171b, + 0x382c3424, 0x3627392d, 0x243a2e36, 0x2a31233f, + 0x70586848, 0x7e536541, 0x6c4e725a, 0x62457f53, + 0x48745c6c, 0x467f5165, 0x5462467e, 0x5a694b77, + 0xe0b0d090, 0xeebbdd99, 0xfca6ca82, 0xf2adc78b, + 0xd89ce4b4, 0xd697e9bd, 0xc48afea6, 0xca81f3af, + 0x90e8b8d8, 0x9ee3b5d1, 0x8cfea2ca, 0x82f5afc3, + 0xa8c48cfc, 0xa6cf81f5, 0xb4d296ee, 0xbad99be7, + 0xdb7bbb3b, 0xd570b632, 0xc76da129, 
0xc966ac20, + 0xe3578f1f, 0xed5c8216, 0xff41950d, 0xf14a9804, + 0xab23d373, 0xa528de7a, 0xb735c961, 0xb93ec468, + 0x930fe757, 0x9d04ea5e, 0x8f19fd45, 0x8112f04c, + 0x3bcb6bab, 0x35c066a2, 0x27dd71b9, 0x29d67cb0, + 0x03e75f8f, 0x0dec5286, 0x1ff1459d, 0x11fa4894, + 0x4b9303e3, 0x45980eea, 0x578519f1, 0x598e14f8, + 0x73bf37c7, 0x7db43ace, 0x6fa92dd5, 0x61a220dc, + 0xadf66d76, 0xa3fd607f, 0xb1e07764, 0xbfeb7a6d, + 0x95da5952, 0x9bd1545b, 0x89cc4340, 0x87c74e49, + 0xddae053e, 0xd3a50837, 0xc1b81f2c, 0xcfb31225, + 0xe582311a, 0xeb893c13, 0xf9942b08, 0xf79f2601, + 0x4d46bde6, 0x434db0ef, 0x5150a7f4, 0x5f5baafd, + 0x756a89c2, 0x7b6184cb, 0x697c93d0, 0x67779ed9, + 0x3d1ed5ae, 0x3315d8a7, 0x2108cfbc, 0x2f03c2b5, + 0x0532e18a, 0x0b39ec83, 0x1924fb98, 0x172ff691, + 0x768dd64d, 0x7886db44, 0x6a9bcc5f, 0x6490c156, + 0x4ea1e269, 0x40aaef60, 0x52b7f87b, 0x5cbcf572, + 0x06d5be05, 0x08deb30c, 0x1ac3a417, 0x14c8a91e, + 0x3ef98a21, 0x30f28728, 0x22ef9033, 0x2ce49d3a, + 0x963d06dd, 0x98360bd4, 0x8a2b1ccf, 0x842011c6, + 0xae1132f9, 0xa01a3ff0, 0xb20728eb, 0xbc0c25e2, + 0xe6656e95, 0xe86e639c, 0xfa737487, 0xf478798e, + 0xde495ab1, 0xd04257b8, 0xc25f40a3, 0xcc544daa, + 0x41f7daec, 0x4ffcd7e5, 0x5de1c0fe, 0x53eacdf7, + 0x79dbeec8, 0x77d0e3c1, 0x65cdf4da, 0x6bc6f9d3, + 0x31afb2a4, 0x3fa4bfad, 0x2db9a8b6, 0x23b2a5bf, + 0x09838680, 0x07888b89, 0x15959c92, 0x1b9e919b, + 0xa1470a7c, 0xaf4c0775, 0xbd51106e, 0xb35a1d67, + 0x996b3e58, 0x97603351, 0x857d244a, 0x8b762943, + 0xd11f6234, 0xdf146f3d, 0xcd097826, 0xc302752f, + 0xe9335610, 0xe7385b19, 0xf5254c02, 0xfb2e410b, + 0x9a8c61d7, 0x94876cde, 0x869a7bc5, 0x889176cc, + 0xa2a055f3, 0xacab58fa, 0xbeb64fe1, 0xb0bd42e8, + 0xead4099f, 0xe4df0496, 0xf6c2138d, 0xf8c91e84, + 0xd2f83dbb, 0xdcf330b2, 0xceee27a9, 0xc0e52aa0, + 0x7a3cb147, 0x7437bc4e, 0x662aab55, 0x6821a65c, + 0x42108563, 0x4c1b886a, 0x5e069f71, 0x500d9278, + 0x0a64d90f, 0x046fd406, 0x1672c31d, 0x1879ce14, + 0x3248ed2b, 0x3c43e022, 0x2e5ef739, 0x2055fa30, + 0xec01b79a, 0xe20aba93, 0xf017ad88, 0xfe1ca081, + 0xd42d83be, 0xda268eb7, 0xc83b99ac, 0xc63094a5, + 0x9c59dfd2, 0x9252d2db, 0x804fc5c0, 0x8e44c8c9, + 0xa475ebf6, 0xaa7ee6ff, 0xb863f1e4, 0xb668fced, + 0x0cb1670a, 0x02ba6a03, 0x10a77d18, 0x1eac7011, + 0x349d532e, 0x3a965e27, 0x288b493c, 0x26804435, + 0x7ce90f42, 0x72e2024b, 0x60ff1550, 0x6ef41859, + 0x44c53b66, 0x4ace366f, 0x58d32174, 0x56d82c7d, + 0x377a0ca1, 0x397101a8, 0x2b6c16b3, 0x25671bba, + 0x0f563885, 0x015d358c, 0x13402297, 0x1d4b2f9e, + 0x472264e9, 0x492969e0, 0x5b347efb, 0x553f73f2, + 0x7f0e50cd, 0x71055dc4, 0x63184adf, 0x6d1347d6, + 0xd7cadc31, 0xd9c1d138, 0xcbdcc623, 0xc5d7cb2a, + 0xefe6e815, 0xe1ede51c, 0xf3f0f207, 0xfdfbff0e, + 0xa792b479, 0xa999b970, 0xbb84ae6b, 0xb58fa362, + 0x9fbe805d, 0x91b58d54, 0x83a89a4f, 0x8da39746 + } +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _AESTAB2_H */ diff --git a/module/icp/asm-x86_64/modes/gcm_intel.S b/module/icp/asm-x86_64/modes/gcm_intel.S new file mode 100644 index 000000000..9bb40bf23 --- /dev/null +++ b/module/icp/asm-x86_64/modes/gcm_intel.S @@ -0,0 +1,334 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
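The tables above complete aestab2.h: t_rc holds the ten key-schedule round constants, each of the four t_ls tables holds the forward S-box value shifted into a different byte lane (so t_ls[1][x] == t_ls[0][x] << 8, and so on), and the t_im tables hold the per-byte inverse MixColumns products used when the decryption key schedule is derived. As a small illustration of the first of these, the round constants come from repeated doubling in GF(2^8) modulo the AES polynomial 0x11b; the helper below is a sketch for this annotation only and is not part of the generated header.

#include <stdint.h>

/*
 * Illustration only (not part of aestab2.h): doubling in GF(2^8) modulo
 * x^8 + x^4 + x^3 + x + 1 (0x11b).  Starting from 0x01 and doubling nine
 * times reproduces the t_rc[] sequence 0x01, 0x02, ... 0x80, 0x1b, 0x36.
 */
static uint32_t
next_rc(uint32_t rc)
{
	return ((rc << 1) ^ ((rc & 0x80) ? 0x11b : 0));
}

Pre-expanding these values into static const tables keeps the key-schedule code in aeskey.c free of per-round GF(2^8) arithmetic.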
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009 Intel Corporation + * All Rights Reserved. + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains an accelerated + * Galois Field Multiplication implementation. + * + * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, + * carry-less multiplication. More information about PCLMULQDQ can be + * found at: + * http://software.intel.com/en-us/articles/ + * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as file galois_hash_asm.c from + * Intel Corporation dated September 21, 2009. + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function + * definition for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. Removed code to perform hashing. This is already done with C macro + * GHASH in gcm.c. For better performance, this removed code should be + * reintegrated in the future to replace the C GHASH macro. + * + * 5. Added code to byte swap 16-byte input and output. + * + * 6. Folded in comments from the original C source with embedded assembly + * (SB_w_shift_xor.c) + * + * 7. Renamed function and reordered parameters to match OpenSolaris: + * Intel interface: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * OpenSolaris OS interface: + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * ==================================================================== + */ + + +#if defined(lint) || defined(__lint) + +#include <sys/types.h> + +/* ARGSUSED */ +void +gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { +} + +#else /* lint */ + +#define _ASM +#include <sys/asm_linkage.h> + +#ifdef _KERNEL + /* + * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is, + * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it + * uses it to pass P2 to syscall. + * This also occurs with the STTS macro, but we dont care if + * P2 (%rsi) is modified just before function exit. + * The CLTS and STTS macros push and pop P1 (%rdi) already. 
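Since the heart of this routine is the carry-less multiplication mentioned above, a minimal C model of what a single pclmulqdq instruction computes may help: a 64x64 to 128-bit polynomial multiplication over GF(2), in which partial products are combined with XOR so no carries propagate. The helper below is illustrative only (its name is not from this patch); GHASH builds the full 128x128-bit product out of four such multiplies and then reduces it modulo x^128 + x^7 + x^2 + x + 1, which is what the two shift-and-XOR reduction phases later in this function implement.

#include <stdint.h>

/* Carry-less multiply: 64 x 64 -> 128 bits, the work of one pclmulqdq. */
static void
clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
{
	uint64_t h = 0, l = 0;
	int i;

	for (i = 0; i < 64; i++) {
		if ((b >> i) & 1) {
			l ^= a << i;		/* XOR instead of add */
			if (i != 0)
				h ^= a >> (64 - i);
		}
	}
	*hi = h;
	*lo = l;
}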
+ */ +#ifdef __xpv +#define PROTECTED_CLTS \ + push %rsi; \ + CLTS; \ + pop %rsi +#else +#define PROTECTED_CLTS \ + CLTS +#endif /* __xpv */ + + /* + * If CR0_TS is not set, align stack (with push %rbp) and push + * %xmm0 - %xmm10 on stack, otherwise clear CR0_TS + */ +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) \ + push %rbp; \ + mov %rsp, %rbp; \ + movq %cr0, tmpreg; \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + and $-XMM_ALIGN, %rsp; \ + sub $[XMM_SIZE * 11], %rsp; \ + movaps %xmm0, 160(%rsp); \ + movaps %xmm1, 144(%rsp); \ + movaps %xmm2, 128(%rsp); \ + movaps %xmm3, 112(%rsp); \ + movaps %xmm4, 96(%rsp); \ + movaps %xmm5, 80(%rsp); \ + movaps %xmm6, 64(%rsp); \ + movaps %xmm7, 48(%rsp); \ + movaps %xmm8, 32(%rsp); \ + movaps %xmm9, 16(%rsp); \ + movaps %xmm10, (%rsp); \ + jmp 2f; \ +1: \ + PROTECTED_CLTS; \ +2: + + + /* + * If CR0_TS was not set above, pop %xmm0 - %xmm10 off stack, + * otherwise set CR0_TS. + */ +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) \ + testq $CR0_TS, tmpreg; \ + jnz 1f; \ + movaps (%rsp), %xmm10; \ + movaps 16(%rsp), %xmm9; \ + movaps 32(%rsp), %xmm8; \ + movaps 48(%rsp), %xmm7; \ + movaps 64(%rsp), %xmm6; \ + movaps 80(%rsp), %xmm5; \ + movaps 96(%rsp), %xmm4; \ + movaps 112(%rsp), %xmm3; \ + movaps 128(%rsp), %xmm2; \ + movaps 144(%rsp), %xmm1; \ + movaps 160(%rsp), %xmm0; \ + jmp 2f; \ +1: \ + STTS(tmpreg); \ +2: \ + mov %rbp, %rsp; \ + pop %rbp + + +#else +#define PROTECTED_CLTS +#define CLEAR_TS_OR_PUSH_XMM_REGISTERS(tmpreg) +#define SET_TS_OR_POP_XMM_REGISTERS(tmpreg) +#endif /* _KERNEL */ + +/* + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction + */ + +// static uint8_t byte_swap16_mask[] = { +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 }; +.text +.align XMM_ALIGN +.Lbyte_swap16_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + + + +/* + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on P1 and P2 and place the result in P3. + * + * Byte swap the input and the output. + * + * Note: x_in, y, and res all point to a block of 20-byte numbers + * (an array of two 64-bit integers). + * + * Note2: For kernel code, caller is responsible for ensuring + * kpreempt_disable() has been called. This is because %xmm registers are + * not saved/restored. Clear and set the CR0.TS bit on entry and exit, + * respectively, if TS is set on entry. Otherwise, if TS is not set, + * save and restore %xmm registers on the stack. 
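Functionally, the routine documented here computes an ordinary GHASH multiplication in GF(2^128). For reference, a bit-serial model following Algorithm 1 of NIST SP 800-38D is sketched below; it assumes each 128-bit block is held as two 64-bit words, most-significant word first, with bits numbered from the most-significant end (bridging that view and the little-endian in-memory layout is what the byte swapping listed in change 5 above is for). The function and its name are illustrative and not part of the patch.

#include <stdint.h>

/*
 * Bit-serial GHASH multiply (NIST SP 800-38D, Algorithm 1):
 * z = x * y in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.
 * Word 0 is the most significant; bit 0 of the spec is the MSB of word 0.
 */
static void
gf128_mul(const uint64_t x[2], const uint64_t y[2], uint64_t z[2])
{
	uint64_t v0 = y[0], v1 = y[1];		/* V = Y */
	uint64_t z0 = 0, z1 = 0;
	int i;

	for (i = 0; i < 128; i++) {
		uint64_t xbit = (i < 64) ? (x[0] >> (63 - i)) & 1 :
		    (x[1] >> (127 - i)) & 1;
		uint64_t lsb = v1 & 1;

		if (xbit) {			/* Z ^= V when bit i of X is set */
			z0 ^= v0;
			z1 ^= v1;
		}
		v1 = (v1 >> 1) | (v0 << 63);	/* V >>= 1 ... */
		v0 >>= 1;
		if (lsb)			/* ... folding in R if a 1 fell off */
			v0 ^= 0xe100000000000000ULL;
	}
	z[0] = z0;
	z[1] = z1;
}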
+ * + * Note3: Original Intel definition: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * + * Note4: Register/parameter mapping: + * Intel: + * Parameter 1: %rcx (copied to %xmm0) hk or x_in + * Parameter 2: %rdx (copied to %xmm1) s or y + * Parameter 3: %rdi (result) d or res + * OpenSolaris: + * Parameter 1: %rdi (copied to %xmm0) x_in + * Parameter 2: %rsi (copied to %xmm1) y + * Parameter 3: %rdx (result) res + */ + +ENTRY_NP(gcm_mul_pclmulqdq) + CLEAR_TS_OR_PUSH_XMM_REGISTERS(%r10) + + // + // Copy Parameters + // + movdqu (%rdi), %xmm0 // P1 + movdqu (%rsi), %xmm1 // P2 + + // + // Byte swap 16-byte input + // + lea .Lbyte_swap16_mask(%rip), %rax + movaps (%rax), %xmm10 + pshufb %xmm10, %xmm0 + pshufb %xmm10, %xmm1 + + + // + // Multiply with the hash key + // + movdqu %xmm0, %xmm3 + pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 + + movdqu %xmm0, %xmm4 + pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 + + movdqu %xmm0, %xmm5 + pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 + movdqu %xmm0, %xmm6 + pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 + + pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 + + movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 + psrldq $8, %xmm4 // shift by xmm4 64 bits to the right + pslldq $8, %xmm5 // shift by xmm5 64 bits to the left + pxor %xmm5, %xmm3 + pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope for the fact that the bits are reversed. + movdqu %xmm3, %xmm7 + movdqu %xmm6, %xmm8 + pslld $1, %xmm3 + pslld $1, %xmm6 + psrld $31, %xmm7 + psrld $31, %xmm8 + movdqu %xmm7, %xmm9 + pslldq $4, %xmm8 + pslldq $4, %xmm7 + psrldq $12, %xmm9 + por %xmm7, %xmm3 + por %xmm8, %xmm6 + por %xmm9, %xmm6 + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + movdqu %xmm3, %xmm7 + movdqu %xmm3, %xmm8 + movdqu %xmm3, %xmm9 + pslld $31, %xmm7 // packed right shift shifting << 31 + pslld $30, %xmm8 // packed right shift shifting << 30 + pslld $25, %xmm9 // packed right shift shifting << 25 + pxor %xmm8, %xmm7 // xor the shifted versions + pxor %xmm9, %xmm7 + movdqu %xmm7, %xmm8 + pslldq $12, %xmm7 + psrldq $4, %xmm8 + pxor %xmm7, %xmm3 // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations. 
+ movdqu %xmm3, %xmm2 + movdqu %xmm3, %xmm4 // packed left shifting >> 1 + movdqu %xmm3, %xmm5 + psrld $1, %xmm2 + psrld $2, %xmm4 // packed left shifting >> 2 + psrld $7, %xmm5 // packed left shifting >> 7 + pxor %xmm4, %xmm2 // xor the shifted versions + pxor %xmm5, %xmm2 + pxor %xmm8, %xmm2 + pxor %xmm2, %xmm3 + pxor %xmm3, %xmm6 // the result is in xmm6 + + // + // Byte swap 16-byte result + // + pshufb %xmm10, %xmm6 // %xmm10 has the swap mask + + // + // Store the result + // + movdqu %xmm6, (%rdx) // P3 + + + // + // Cleanup and Return + // + SET_TS_OR_POP_XMM_REGISTERS(%r10) + ret + SET_SIZE(gcm_mul_pclmulqdq) + +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/sha1/sha1-x86_64.S b/module/icp/asm-x86_64/sha1/sha1-x86_64.S new file mode 100644 index 000000000..53cc156a7 --- /dev/null +++ b/module/icp/asm-x86_64/sha1/sha1-x86_64.S @@ -0,0 +1,1346 @@ +/* + * !/usr/bin/env perl + * + * ==================================================================== + * Written by Andy Polyakov <[email protected]> for the OpenSSL + * project. The module is, however, dual licensed under OpenSSL and + * CRYPTOGAMS licenses depending on where you obtain it. For further + * details see http://www.openssl.org/~appro/cryptogams/. + * ==================================================================== + * + * sha1_block procedure for x86_64. + * + * It was brought to my attention that on EM64T compiler-generated code + * was far behind 32-bit assembler implementation. This is unlike on + * Opteron where compiler-generated code was only 15% behind 32-bit + * assembler, which originally made it hard to motivate the effort. + * There was suggestion to mechanically translate 32-bit code, but I + * dismissed it, reasoning that x86_64 offers enough register bank + * capacity to fully utilize SHA-1 parallelism. Therefore this fresh + * implementation:-) However! While 64-bit code does performs better + * on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, + * x86_64 does offer larger *addressable* bank, but out-of-order core + * reaches for even more registers through dynamic aliasing, and EM64T + * core must have managed to run-time optimize even 32-bit code just as + * good as 64-bit one. Performance improvement is summarized in the + * following table: + * + * gcc 3.4 32-bit asm cycles/byte + * Opteron +45% +20% 6.8 + * Xeon P4 +65% +0% 9.9 + * Core2 +60% +10% 7.0 + * + * + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha1-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). + * + */ + +/* + * This file was generated by a perl script (sha1-x86_64.pl). The comments from + * the original file have been pasted above. 
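The unrolled rounds that follow are easier to read against a plain C statement of a single round. The sketch below covers rounds 0 through 19, which use the Ch function and the constant 0x5a827999; rounds 20-39 and 60-79 use the parity function b ^ c ^ d with 0x6ed9eba1 and 0xca62c1d6, and rounds 40-59 use the majority function with 0x8f1bbcdc. The constants -0x70e44324 and -0x359d3e2a seen in the lea instructions below are simply the 32-bit two's-complement encodings of 0x8f1bbcdc and 0xca62c1d6. In the assembly, the message-schedule update (the XOR of four earlier words rotated left by one) is interleaved with these rounds; the helper here is illustrative only and not part of the patch.

#include <stdint.h>

#define ROTL32(x, n)	(((x) << (n)) | ((x) >> (32 - (n))))

/* One SHA-1 round for rounds 0-19; w is the expanded message word. */
static void
sha1_round_ch(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
    uint32_t *e, uint32_t w)
{
	uint32_t f = ((*c ^ *d) & *b) ^ *d;	/* Ch(b,c,d), as in the asm */
	uint32_t t = ROTL32(*a, 5) + f + *e + w + 0x5a827999;

	*e = *d;
	*d = *c;
	*c = ROTL32(*b, 30);
	*b = *a;
	*a = t;
}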
+ */ + +#if defined(lint) || defined(__lint) +#include <sys/stdint.h> +#include <sys/sha1.h> + +/* ARGSUSED */ +void +sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks) +{ +} + +#else +#define _ASM +#include <sys/asm_linkage.h> +ENTRY_NP(sha1_block_data_order) + push %rbx + push %rbp + push %r12 + mov %rsp,%rax + mov %rdi,%r8 # reassigned argument + sub $72,%rsp + mov %rsi,%r9 # reassigned argument + and $-64,%rsp + mov %rdx,%r10 # reassigned argument + mov %rax,64(%rsp) + + mov 0(%r8),%edx + mov 4(%r8),%esi + mov 8(%r8),%edi + mov 12(%r8),%ebp + mov 16(%r8),%r11d +.align 4 +.Lloop: + mov 0(%r9),%eax + bswap %eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 4(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,4(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 8(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,8(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 12(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,12(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 16(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,16(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 20(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,20(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 24(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,24(%rsp) + add %r12d,%edx + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 28(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,28(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 32(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,32(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 36(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,36(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 40(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,40(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 44(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,44(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 48(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,48(%rsp) + add %r12d,%edx + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 52(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + 
and %esi,%ebx + mov %eax,52(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 56(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,56(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 60(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,60(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%esi + xor 32(%rsp),%eax + and %r11d,%ebx + add %esi,%edi + xor 52(%rsp),%eax + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edx + xor 36(%rsp),%eax + and %ebp,%ebx + add %edx,%esi + xor 56(%rsp),%eax + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea 0x5a827999(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + and %edi,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + and %esi,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea 0x5a827999(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + and %edx,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 0(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,36(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor 
%edi,%ebx + add %ebp,%r11d + xor 28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,52(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,56(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,60(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 0(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 32(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 52(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,0(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 4(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 36(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 56(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,4(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 8(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 40(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 60(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,8(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 12(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 44(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 0(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,12(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 16(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 48(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 4(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 20(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 28(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 52(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 8(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 24(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 32(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 56(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 12(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 28(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 36(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + 
xor 60(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 16(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 32(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 40(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 0(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 20(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,32(%rsp) + lea -0x70e44324(%eax,%edx),%esi + mov 36(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 44(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 4(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 24(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,36(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 40(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 48(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 8(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 28(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,40(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 44(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 52(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 12(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 32(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,44(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 48(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 56(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 16(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 36(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,48(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 52(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 60(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 20(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 40(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,52(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 56(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 0(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 24(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 44(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,56(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 60(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 4(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 28(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 48(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,60(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 0(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 8(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 32(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 52(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,0(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 4(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 12(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 36(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 56(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,4(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 8(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 16(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 40(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 60(%rsp),%eax + and %edi,%ecx + add 
%ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,8(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 12(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 20(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 44(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 0(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,12(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 16(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 24(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 48(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 4(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,16(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 20(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 28(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 52(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 8(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,20(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 24(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 32(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 56(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 12(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,24(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 28(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 36(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 60(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 16(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,28(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 32(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 40(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 0(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 20(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,32(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 36(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 44(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 4(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 24(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,36(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 40(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 48(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 8(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 28(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,40(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 44(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 52(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 12(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 32(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,44(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 48(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 56(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 16(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 36(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,48(%rsp) + add %ebx,%edx + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 52(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 60(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 40(%rsp),%eax + rol $30,%esi + add 
%ebx,%r12d + rol $1,%eax + mov %eax,52(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 56(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 0(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 44(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,56(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 60(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 4(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 28(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 48(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,60(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 32(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 52(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 36(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 56(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 0(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,36(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 
28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + lea -0x359d3e2a(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + lea -0x359d3e2a(%eax,%ebp),%r11d + mov %esi,%ebx + mov %r12d,%ebp + xor %edx,%ebx + rol $5,%ebp + xor %edi,%ebx + add %ebp,%r11d + rol $30,%edx + add %ebx,%r11d + // Update and save state information in SHA-1 context + add 0(%r8),%r11d + add 4(%r8),%r12d + add 8(%r8),%edx + add 12(%r8),%esi + add 16(%r8),%edi + mov %r11d,0(%r8) + mov %r12d,4(%r8) + mov %edx,8(%r8) + mov %esi,12(%r8) + mov %edi,16(%r8) + + xchg %r11d,%edx # mov %r11d,%edx + xchg %r12d,%esi # mov %r12d,%esi + xchg %r11d,%edi # mov %edx,%edi + xchg %r12d,%ebp # mov %esi,%ebp + # mov %edi,%r11d + lea 64(%r9),%r9 + sub $1,%r10 + jnz .Lloop + mov 64(%rsp),%rsp + pop %r12 + pop %rbp + pop %rbx + ret +SET_SIZE(sha1_block_data_order) +.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <[email protected]>" + +#endif /* lint || __lint */ diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S new file mode 100644 index 000000000..b6a9bbc86 --- /dev/null +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -0,0 +1,2060 @@ +/* + * ==================================================================== + * Written by Andy Polyakov <[email protected]> for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. In + * former case the instructions operate on 32-bit operands, while in + * latter - on 64-bit ones. 
All I had to do was to get one flavor right,
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to the IA-64 implementation, which maintains
+ * X[16] in the register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is a very good result for the
+ * 3-way issue Opteron pipeline with X[16] maintained in memory. So
+ * *if* there is a way to improve it, *then* the only way would be to
+ * try to offload X[16] updates to the SSE unit, but that would require
+ * a "deeper" loop unroll, which in turn would naturally cause size
+ * blow-up, not to mention increased complexity! And once again, only
+ * *if* it's actually possible to noticeably improve overall ILP,
+ * instruction level parallelism, on a given CPU implementation in
+ * this case.
+ *
+ * Special note on Intel EM64T. While the Opteron CPU exhibits a
+ * perfect performance ratio of 1.5 between the 64- and 32-bit flavors
+ * [see above], [currently available] EM64T CPUs apparently are far
+ * from it. On the contrary, the 64-bit version, sha512_block, is ~30%
+ * *slower* than the 32-bit sha256_block:-( This is presumably because
+ * 64-bit shifts/rotates are apparently not atomic instructions, but
+ * implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from the OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ *    /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed the x86_64-xlate.pl script (not needed for the as(1) or
+ *    gas(1) assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as the OpenSolaris OS has an extra 4-byte field,
+ *    "algotype", at the beginning of SHA2_CTX (the next field is 8-byte
+ *    aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that could
+ * be used to generate sha256 and sha512 variants from the same code base.
+ * For our purposes, we only need sha256, so getting the perl script to
+ * run as part of the build process seemed superfluous. The comments from
+ * the original file have been pasted above.
+ */ + +#if defined(lint) || defined(__lint) +#include <sys/stdint.h> +#include <sha2/sha2.h> + +/* ARGSUSED */ +void +SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) +{ +} + + +#else +#define _ASM +#include <sys/asm_linkage.h> + +ENTRY_NP(SHA256TransformBlocks) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%rbp # copy %rsp + shl $4,%rdx # num*16 + sub $16*4+4*8,%rsp + lea (%rsi,%rdx,4),%rdx # inp+num*16*4 + and $-64,%rsp # align stack frame + add $8,%rdi # Skip OpenSolaris field, "algotype" + mov %rdi,16*4+0*8(%rsp) # save ctx, 1st arg + mov %rsi,16*4+1*8(%rsp) # save inp, 2nd arg + mov %rdx,16*4+2*8(%rsp) # save end pointer, "3rd" arg + mov %rbp,16*4+3*8(%rsp) # save copy of %rsp + + /.picmeup %rbp + / The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts + / the address of the "next" instruction into the target register + / (%rbp). This generates these 2 instructions: + lea .Llea(%rip),%rbp + /nop / .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K256-.(%rbp),%rbp + + mov 4*0(%rdi),%eax + mov 4*1(%rdi),%ebx + mov 4*2(%rdi),%ecx + mov 4*3(%rdi),%edx + mov 4*4(%rdi),%r8d + mov 4*5(%rdi),%r9d + mov 4*6(%rdi),%r10d + mov 4*7(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + xor %rdi,%rdi + mov 4*0(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*1(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*2(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor 
%r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*3(%rsi),%r12d + bswap %r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*4(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*5(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,20(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*6(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # 
h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*7(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 4*8(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,32(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*9(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*10(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*11(%rsi),%r12d + bswap %r12d + mov 
%ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*12(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*13(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*14(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*15(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov 
%r12d,60(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + mov 4(%rsp),%r13d + mov 56(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 36(%rsp),%r12d + + add 0(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 8(%rsp),%r13d + mov 60(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 40(%rsp),%r12d + + add 4(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 12(%rsp),%r13d + mov 0(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 44(%rsp),%r12d + + add 8(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + 
and %ecx,%r15d # (f^g)&e + mov %r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 16(%rsp),%r13d + mov 4(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 48(%rsp),%r12d + + add 12(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 20(%rsp),%r13d + mov 8(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 52(%rsp),%r12d + + add 16(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 24(%rsp),%r13d + mov 12(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 56(%rsp),%r12d + + add 20(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # 
(f^g)&e + mov %r12d,20(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 28(%rsp),%r13d + mov 16(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 60(%rsp),%r12d + + add 24(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 32(%rsp),%r13d + mov 20(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 0(%rsp),%r12d + + add 28(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 36(%rsp),%r13d + mov 24(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 4(%rsp),%r12d + + add 32(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov 
%r12d,32(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 40(%rsp),%r13d + mov 28(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 8(%rsp),%r12d + + add 36(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 44(%rsp),%r13d + mov 32(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 12(%rsp),%r12d + + add 40(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 48(%rsp),%r13d + mov 36(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 16(%rsp),%r12d + + add 44(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov 
%r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 52(%rsp),%r13d + mov 40(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 20(%rsp),%r12d + + add 48(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 56(%rsp),%r13d + mov 44(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 24(%rsp),%r12d + + add 52(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 60(%rsp),%r13d + mov 48(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 28(%rsp),%r12d + + add 56(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + 
xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 0(%rsp),%r13d + mov 52(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 32(%rsp),%r12d + + add 60(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,60(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + cmp $64,%rdi + jb .Lrounds_16_xx + + mov 16*4+0*8(%rsp),%rdi + lea 16*4(%rsi),%rsi + + add 4*0(%rdi),%eax + add 4*1(%rdi),%ebx + add 4*2(%rdi),%ecx + add 4*3(%rdi),%edx + add 4*4(%rdi),%r8d + add 4*5(%rdi),%r9d + add 4*6(%rdi),%r10d + add 4*7(%rdi),%r11d + + cmp 16*4+2*8(%rsp),%rsi + + mov %eax,4*0(%rdi) + mov %ebx,4*1(%rdi) + mov %ecx,4*2(%rdi) + mov %edx,4*3(%rdi) + mov %r8d,4*4(%rdi) + mov %r9d,4*5(%rdi) + mov %r10d,4*6(%rdi) + mov %r11d,4*7(%rdi) + jb .Lloop + + mov 16*4+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA256TransformBlocks) + +.align 64 +.type K256,@object +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +#endif /* !lint && !__lint */ |
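
Editor's note: for readers tracing the unrolled SHA-1 rounds in sha1-x86_64.S above (the group built around the signed immediate -0x359d3e2a, i.e. K = 0xCA62C1D6 for rounds 60-79), the following is a minimal C sketch of one such round together with the 16-word circular message schedule the assembly keeps on the stack. The helper names (rotl32, sha1_round_60_79) are illustrative only and are not part of the committed files.

/* Sketch only: one SHA-1 round from the final group (t = 60..79). */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}

static void sha1_round_60_79(uint32_t s[5], uint32_t X[16], int t)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3], e = s[4];
	uint32_t w, tmp;

	/* Message schedule over a 16-word circular buffer:
	 * W[t] = ROL1(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]) */
	w = rotl32(X[(t + 13) & 0xf] ^ X[(t + 8) & 0xf] ^
	    X[(t + 2) & 0xf] ^ X[t & 0xf], 1);
	X[t & 0xf] = w;

	/* F(b,c,d) = b ^ c ^ d in this round group; 0xCA62C1D6 is the
	 * constant the assembly folds into its lea as -0x359d3e2a. */
	tmp = rotl32(a, 5) + (b ^ c ^ d) + e + w + 0xCA62C1D6u;
	s[4] = d;
	s[3] = c;
	s[2] = rotl32(b, 30);
	s[1] = a;
	s[0] = tmp;
}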
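Editor's note: similarly, the comments in the SHA256TransformBlocks rounds above spell out the per-round algebra (Sigma1(e), Ch(e,f,g)=((f^g)&e)^g, Maj(a,b,c)=((a|c)&b)|(a&c), and the sigma0/sigma1 message schedule). Below is a minimal C restatement of one round with t >= 16, assuming the same 16-word circular schedule buffer the assembly keeps on the stack; the names are illustrative only, not part of the committed file.

/* Sketch only: one SHA-256 round (t >= 16) as described by the asm comments. */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, unsigned n)
{
	return (x >> n) | (x << (32 - n));
}

#define Sigma1(e) (rotr32((e), 6) ^ rotr32((e), 11) ^ rotr32((e), 25))
#define Sigma0(a) (rotr32((a), 2) ^ rotr32((a), 13) ^ rotr32((a), 22))
#define Ch(e, f, g)  ((((f) ^ (g)) & (e)) ^ (g))         /* ((f^g)&e)^g, as in the asm   */
#define Maj(a, b, c) ((((a) | (c)) & (b)) | ((a) & (c))) /* ((a|c)&b)|(a&c), as in the asm */
#define sigma0(x) (rotr32((x), 7) ^ rotr32((x), 18) ^ ((x) >> 3))
#define sigma1(x) (rotr32((x), 17) ^ rotr32((x), 19) ^ ((x) >> 10))

/* s[0..7] = a..h; X[] is the 16-word circular message schedule; K is K256. */
static void sha256_round(uint32_t s[8], uint32_t X[16], const uint32_t K[64], int t)
{
	uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
	uint32_t e = s[4], f = s[5], g = s[6], h = s[7];
	uint32_t w, T1;

	/* W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16] */
	w = X[t & 0xf] + sigma0(X[(t + 1) & 0xf]) +
	    X[(t + 9) & 0xf] + sigma1(X[(t + 14) & 0xf]);
	X[t & 0xf] = w;

	T1 = h + Sigma1(e) + Ch(e, f, g) + K[t] + w;     /* T1+=h, Sigma1(e), Ch, K[round] */
	s[7] = g; s[6] = f; s[5] = e;
	s[4] = d + T1;                                   /* d += T1 */
	s[3] = c; s[2] = b; s[1] = a;
	s[0] = T1 + Sigma0(a) + Maj(a, b, c);            /* h = T1 + Sigma0(a) + Maj(a,b,c) */
}

Note also that the `add $8,%rdi` in the prologue simply skips the OpenSolaris SHA2_CTX "algotype" field (change 4 in the header comment), so %rdi points at the eight 32-bit state words corresponding to s[8] in this sketch.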