aboutsummaryrefslogtreecommitdiffstats
path: root/module/icp/asm-x86_64/aes/aes_intel.S
diff options
context:
space:
mode:
Diffstat (limited to 'module/icp/asm-x86_64/aes/aes_intel.S')
-rw-r--r--module/icp/asm-x86_64/aes/aes_intel.S851
1 files changed, 851 insertions, 0 deletions
diff --git a/module/icp/asm-x86_64/aes/aes_intel.S b/module/icp/asm-x86_64/aes/aes_intel.S
new file mode 100644
index 000000000..0b4700f96
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aes_intel.S
@@ -0,0 +1,851 @@
+/*
+ * ====================================================================
+ * Written by Intel Corporation for the OpenSSL project to add support
+ * for Intel AES-NI instructions. Rights for redistribution and usage
+ * in source and binary forms are granted according to the OpenSSL
+ * license.
+ *
+ * Author: Huang Ying <ying.huang at intel dot com>
+ * Vinodh Gopal <vinodh.gopal at intel dot com>
+ * Kahraman Akdemir
+ *
+ * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD)
+ * instructions that are going to be introduced in the next generation
+ * of Intel processor, as of 2009. These instructions enable fast and
+ * secure data encryption and decryption, using the Advanced Encryption
+ * Standard (AES), defined by FIPS Publication number 197. The
+ * architecture introduces six instructions that offer full hardware
+ * support for AES. Four of them support high performance data
+ * encryption and decryption, and the other two instructions support
+ * the AES key expansion procedure.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ */
+
+/*
+ * ====================================================================
+ * OpenSolaris OS modifications
+ *
+ * This source originates as files aes-intel.S and eng_aesni_asm.pl, in
+ * patches sent sent Dec. 9, 2008 and Dec. 24, 2008, respectively, by
+ * Huang Ying of Intel to the openssl-dev mailing list under the subject
+ * of "Add support to Intel AES-NI instruction set for x86_64 platform".
+ *
+ * This OpenSolaris version has these major changes from the original source:
+ *
+ * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function
+ * definitions for lint.
+ *
+ * 2. Formatted code, added comments, and added #includes and #defines.
+ *
+ * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
+ * calling kpreempt_disable() and kpreempt_enable().
+ * If the TS bit is not set, Save and restore %xmm registers at the beginning
+ * and end of function calls (%xmm* registers are not saved and restored by
+ * during kernel thread preemption).
+ *
+ * 4. Renamed functions, reordered parameters, and changed return value
+ * to match OpenSolaris:
+ *
+ * OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return values for above are non-zero on error, 0 on success.
+ *
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ * typedef struct aes_key_st {
+ * unsigned int rd_key[4 *(AES_MAXNR + 1)];
+ * int rounds;
+ * unsigned int pad[3];
+ * } AES_KEY;
+ * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules
+ * (ks32) instead of 64-bit (ks64).
+ * Number of rounds (aka round count) is at offset 240 of AES_KEY.
+ *
+ * OpenSolaris OS interface (#ifdefs removed for readability):
+ * int rijndael_key_setup_dec_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * int rijndael_key_setup_enc_intel(uint32_t rk[],
+ * const uint32_t cipherKey[], uint64_t keyBits);
+ * Return values for above are 0 on error, number of rounds on success.
+ *
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4]);
+ * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4];
+ * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t;
+ *
+ * typedef union {
+ * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
+ * } aes_ks_t;
+ * typedef struct aes_key {
+ * aes_ks_t encr_ks, decr_ks;
+ * long double align128;
+ * int flags, nr, type;
+ * } aes_key_t;
+ *
+ * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text,
+ * ct is crypto text, and MAX_AES_NR is 14.
+ * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64.
+ *
+ * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary.
+ *
+ * ====================================================================
+ */
+
+#if defined(lint) || defined(__lint)
+
+#include <sys/types.h>
+
+/* ARGSUSED */
+void
+aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4],
+ uint32_t ct[4]) {
+}
+/* ARGSUSED */
+void
+aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ uint32_t pt[4]) {
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) {
+ return (0);
+}
+/* ARGSUSED */
+int
+rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ uint64_t keyBits) {
+ return (0);
+}
+
+
+#else /* lint */
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#ifdef _KERNEL
+ /*
+ * Note: the CLTS macro clobbers P2 (%rsi) under i86xpv. That is,
+ * it calls HYPERVISOR_fpu_taskswitch() which modifies %rsi when it
+ * uses it to pass P2 to syscall.
+ * This also occurs with the STTS macro, but we dont care if
+ * P2 (%rsi) is modified just before function exit.
+ * The CLTS and STTS macros push and pop P1 (%rdi) already.
+ */
+#ifdef __xpv
+#define PROTECTED_CLTS \
+ push %rsi; \
+ CLTS; \
+ pop %rsi
+#else
+#define PROTECTED_CLTS \
+ CLTS
+#endif /* __xpv */
+
+#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg) \
+ push %rbp; \
+ mov %rsp, %rbp; \
+ movq %cr0, tmpreg; \
+ testq $CR0_TS, tmpreg; \
+ jnz 1f; \
+ and $-XMM_ALIGN, %rsp; \
+ sub $[XMM_SIZE * 2], %rsp; \
+ movaps %xmm0, 16(%rsp); \
+ movaps %xmm1, (%rsp); \
+ jmp 2f; \
+1: \
+ PROTECTED_CLTS; \
+2:
+
+ /*
+ * If CR0_TS was not set above, pop %xmm0 and %xmm1 off stack,
+ * otherwise set CR0_TS.
+ */
+#define SET_TS_OR_POP_XMM0_XMM1(tmpreg) \
+ testq $CR0_TS, tmpreg; \
+ jnz 1f; \
+ movaps (%rsp), %xmm1; \
+ movaps 16(%rsp), %xmm0; \
+ jmp 2f; \
+1: \
+ STTS(tmpreg); \
+2: \
+ mov %rbp, %rsp; \
+ pop %rbp
+
+ /*
+ * If CR0_TS is not set, align stack (with push %rbp) and push
+ * %xmm0 - %xmm6 on stack, otherwise clear CR0_TS
+ */
+#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg) \
+ push %rbp; \
+ mov %rsp, %rbp; \
+ movq %cr0, tmpreg; \
+ testq $CR0_TS, tmpreg; \
+ jnz 1f; \
+ and $-XMM_ALIGN, %rsp; \
+ sub $[XMM_SIZE * 7], %rsp; \
+ movaps %xmm0, 96(%rsp); \
+ movaps %xmm1, 80(%rsp); \
+ movaps %xmm2, 64(%rsp); \
+ movaps %xmm3, 48(%rsp); \
+ movaps %xmm4, 32(%rsp); \
+ movaps %xmm5, 16(%rsp); \
+ movaps %xmm6, (%rsp); \
+ jmp 2f; \
+1: \
+ PROTECTED_CLTS; \
+2:
+
+
+ /*
+ * If CR0_TS was not set above, pop %xmm0 - %xmm6 off stack,
+ * otherwise set CR0_TS.
+ */
+#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg) \
+ testq $CR0_TS, tmpreg; \
+ jnz 1f; \
+ movaps (%rsp), %xmm6; \
+ movaps 16(%rsp), %xmm5; \
+ movaps 32(%rsp), %xmm4; \
+ movaps 48(%rsp), %xmm3; \
+ movaps 64(%rsp), %xmm2; \
+ movaps 80(%rsp), %xmm1; \
+ movaps 96(%rsp), %xmm0; \
+ jmp 2f; \
+1: \
+ STTS(tmpreg); \
+2: \
+ mov %rbp, %rsp; \
+ pop %rbp
+
+
+#else
+#define PROTECTED_CLTS
+#define CLEAR_TS_OR_PUSH_XMM0_XMM1(tmpreg)
+#define SET_TS_OR_POP_XMM0_XMM1(tmpreg)
+#define CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(tmpreg)
+#define SET_TS_OR_POP_XMM0_TO_XMM6(tmpreg)
+#endif /* _KERNEL */
+
+
+/*
+ * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(),
+ * _key_expansion_256a(), _key_expansion_256b()
+ *
+ * Helper functions called by rijndael_key_setup_inc_intel().
+ * Also used indirectly by rijndael_key_setup_dec_intel().
+ *
+ * Input:
+ * %xmm0 User-provided cipher key
+ * %xmm1 Round constant
+ * Output:
+ * (%rcx) AES key
+ */
+
+.align 16
+_key_expansion_128:
+_key_expansion_256a:
+ pshufd $0b11111111, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+ SET_SIZE(_key_expansion_128)
+ SET_SIZE(_key_expansion_256a)
+
+.align 16
+_key_expansion_192a:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ movaps %xmm2, %xmm6
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, %xmm1
+ shufps $0b01000100, %xmm0, %xmm6
+ movaps %xmm6, (%rcx)
+ shufps $0b01001110, %xmm2, %xmm1
+ movaps %xmm1, 0x10(%rcx)
+ add $0x20, %rcx
+ ret
+ SET_SIZE(_key_expansion_192a)
+
+.align 16
+_key_expansion_192b:
+ pshufd $0b01010101, %xmm1, %xmm1
+ shufps $0b00010000, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ shufps $0b10001100, %xmm0, %xmm4
+ pxor %xmm4, %xmm0
+ pxor %xmm1, %xmm0
+
+ movaps %xmm2, %xmm5
+ pslldq $4, %xmm5
+ pshufd $0b11111111, %xmm0, %xmm3
+ pxor %xmm3, %xmm2
+ pxor %xmm5, %xmm2
+
+ movaps %xmm0, (%rcx)
+ add $0x10, %rcx
+ ret
+ SET_SIZE(_key_expansion_192b)
+
+.align 16
+_key_expansion_256b:
+ pshufd $0b10101010, %xmm1, %xmm1
+ shufps $0b00010000, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ shufps $0b10001100, %xmm2, %xmm4
+ pxor %xmm4, %xmm2
+ pxor %xmm1, %xmm2
+ movaps %xmm2, (%rcx)
+ add $0x10, %rcx
+ ret
+ SET_SIZE(_key_expansion_256b)
+
+
+/*
+ * rijndael_key_setup_enc_intel()
+ * Expand the cipher key into the encryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_encrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+
+#ifdef OPENSSL_INTERFACE
+#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key
+#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key
+
+#define USERCIPHERKEY rdi /* P1, 64 bits */
+#define KEYSIZE32 esi /* P2, 32 bits */
+#define KEYSIZE64 rsi /* P2, 64 bits */
+#define AESKEY rdx /* P3, 64 bits */
+
+#else /* OpenSolaris Interface */
+#define AESKEY rdi /* P1, 64 bits */
+#define USERCIPHERKEY rsi /* P2, 64 bits */
+#define KEYSIZE32 edx /* P3, 32 bits */
+#define KEYSIZE64 rdx /* P3, 64 bits */
+#endif /* OPENSSL_INTERFACE */
+
+#define ROUNDS32 KEYSIZE32 /* temp */
+#define ROUNDS64 KEYSIZE64 /* temp */
+#define ENDAESKEY USERCIPHERKEY /* temp */
+
+ENTRY_NP(rijndael_key_setup_enc_intel)
+rijndael_key_setup_enc_intel_local:
+ CLEAR_TS_OR_PUSH_XMM0_TO_XMM6(%r10)
+
+ // NULL pointer sanity check
+ test %USERCIPHERKEY, %USERCIPHERKEY
+ jz .Lenc_key_invalid_param
+ test %AESKEY, %AESKEY
+ jz .Lenc_key_invalid_param
+
+ movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes)
+ movaps %xmm0, (%AESKEY)
+ lea 0x10(%AESKEY), %rcx // key addr
+ pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x
+
+ cmp $256, %KEYSIZE32
+ jnz .Lenc_key192
+
+ // AES 256: 14 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $14, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14
+#endif /* OPENSSL_INTERFACE */
+
+ movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes)
+ movaps %xmm2, (%rcx)
+ add $0x10, %rcx
+
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a
+ aeskeygenassist $0x1, %xmm0, %xmm1
+ call _key_expansion_256b
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a
+ aeskeygenassist $0x2, %xmm0, %xmm1
+ call _key_expansion_256b
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a
+ aeskeygenassist $0x4, %xmm0, %xmm1
+ call _key_expansion_256b
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a
+ aeskeygenassist $0x8, %xmm0, %xmm1
+ call _key_expansion_256b
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a
+ aeskeygenassist $0x10, %xmm0, %xmm1
+ call _key_expansion_256b
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a
+ aeskeygenassist $0x20, %xmm0, %xmm1
+ call _key_expansion_256b
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_256a
+
+ SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* Open Solaris Interface */
+ mov $14, %rax // return # rounds = 14
+#endif
+ ret
+
+.align 4
+.Lenc_key192:
+ cmp $192, %KEYSIZE32
+ jnz .Lenc_key128
+
+ // AES 192: 12 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $12, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12
+#endif /* OPENSSL_INTERFACE */
+
+ movq 0x10(%USERCIPHERKEY), %xmm2 // other user key
+ aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a
+ aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b
+ aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a
+ aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b
+ aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a
+ aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b
+ aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192a
+ aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key
+ call _key_expansion_192b
+
+ SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $12, %rax // return # rounds = 12
+#endif
+ ret
+
+.align 4
+.Lenc_key128:
+ cmp $128, %KEYSIZE32
+ jnz .Lenc_key_invalid_key_bits
+
+ // AES 128: 10 rounds in encryption key schedule
+#ifdef OPENSSL_INTERFACE
+ mov $10, %ROUNDS32
+ movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10
+#endif /* OPENSSL_INTERFACE */
+
+ aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+ aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key
+ call _key_expansion_128
+
+ SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
+#ifdef OPENSSL_INTERFACE
+ xor %rax, %rax // return 0 (OK)
+#else /* OpenSolaris Interface */
+ mov $10, %rax // return # rounds = 10
+#endif
+ ret
+
+.Lenc_key_invalid_param:
+#ifdef OPENSSL_INTERFACE
+ SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
+ mov $-1, %rax // user key or AES key pointer is NULL
+ ret
+#else
+ /* FALLTHROUGH */
+#endif /* OPENSSL_INTERFACE */
+
+.Lenc_key_invalid_key_bits:
+ SET_TS_OR_POP_XMM0_TO_XMM6(%r10)
+#ifdef OPENSSL_INTERFACE
+ mov $-2, %rax // keysize is invalid
+#else /* Open Solaris Interface */
+ xor %rax, %rax // a key pointer is NULL or invalid keysize
+#endif /* OPENSSL_INTERFACE */
+
+ ret
+ SET_SIZE(rijndael_key_setup_enc_intel)
+
+
+/*
+ * rijndael_key_setup_dec_intel()
+ * Expand the cipher key into the decryption key schedule.
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * OpenSolaris interface:
+ * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[],
+ * uint64_t keyBits);
+ * Return value is 0 on error, number of rounds on success.
+ * P1->P2, P2->P3, P3->P1
+ *
+ * Original Intel OpenSSL interface:
+ * int intel_AES_set_decrypt_key(const unsigned char *userKey,
+ * const int bits, AES_KEY *key);
+ * Return value is non-zero on error, 0 on success.
+ */
+ENTRY_NP(rijndael_key_setup_dec_intel)
+ // Generate round keys used for encryption
+ call rijndael_key_setup_enc_intel_local
+ test %rax, %rax
+#ifdef OPENSSL_INTERFACE
+ jnz .Ldec_key_exit // Failed if returned non-0
+#else /* OpenSolaris Interface */
+ jz .Ldec_key_exit // Failed if returned 0
+#endif /* OPENSSL_INTERFACE */
+
+ CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
+
+ /*
+ * Convert round keys used for encryption
+ * to a form usable for decryption
+ */
+#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */
+ mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14)
+ // (already set for OpenSSL)
+#endif
+
+ lea 0x10(%AESKEY), %rcx // key addr
+ shl $4, %ROUNDS32
+ add %AESKEY, %ROUNDS64
+ mov %ROUNDS64, %ENDAESKEY
+
+.align 4
+.Ldec_key_reorder_loop:
+ movaps (%AESKEY), %xmm0
+ movaps (%ROUNDS64), %xmm1
+ movaps %xmm0, (%ROUNDS64)
+ movaps %xmm1, (%AESKEY)
+ lea 0x10(%AESKEY), %AESKEY
+ lea -0x10(%ROUNDS64), %ROUNDS64
+ cmp %AESKEY, %ROUNDS64
+ ja .Ldec_key_reorder_loop
+
+.align 4
+.Ldec_key_inv_loop:
+ movaps (%rcx), %xmm0
+ // Convert an encryption round key to a form usable for decryption
+ // with the "AES Inverse Mix Columns" instruction
+ aesimc %xmm0, %xmm1
+ movaps %xmm1, (%rcx)
+ lea 0x10(%rcx), %rcx
+ cmp %ENDAESKEY, %rcx
+ jnz .Ldec_key_inv_loop
+
+ SET_TS_OR_POP_XMM0_XMM1(%r10)
+
+.Ldec_key_exit:
+ // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error
+ // OpenSSL: rax = 0 for OK, or non-zero for error
+ ret
+ SET_SIZE(rijndael_key_setup_dec_intel)
+
+
+/*
+ * aes_encrypt_intel()
+ * Encrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_encrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_encrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key)
+ */
+
+#ifdef OPENSSL_INTERFACE
+#define aes_encrypt_intel intel_AES_encrypt
+#define aes_decrypt_intel intel_AES_decrypt
+
+#define INP rdi /* P1, 64 bits */
+#define OUTP rsi /* P2, 64 bits */
+#define KEYP rdx /* P3, 64 bits */
+
+/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */
+#define NROUNDS32 ecx /* temporary, 32 bits */
+#define NROUNDS cl /* temporary, 8 bits */
+
+#else /* OpenSolaris Interface */
+#define KEYP rdi /* P1, 64 bits */
+#define NROUNDS esi /* P2, 32 bits */
+#define INP rdx /* P3, 64 bits */
+#define OUTP rcx /* P4, 64 bits */
+#endif /* OPENSSL_INTERFACE */
+
+#define STATE xmm0 /* temporary, 128 bits */
+#define KEY xmm1 /* temporary, 128 bits */
+
+ENTRY_NP(aes_encrypt_intel)
+ CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
+
+ movups (%INP), %STATE // input
+ movaps (%KEYP), %KEY // key
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP
+ cmp $12, %NROUNDS
+ jb .Lenc128
+ lea 0x20(%KEYP), %KEYP
+ je .Lenc192
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP
+ movaps -0x60(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps -0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc192:
+ // AES 192 and 256
+ movaps -0x40(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps -0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+
+.align 4
+.Lenc128:
+ // AES 128, 192, and 256
+ movaps -0x20(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps -0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps (%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps 0x10(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps 0x20(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps 0x30(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps 0x40(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps 0x50(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps 0x60(%KEYP), %KEY
+ aesenc %KEY, %STATE
+ movaps 0x70(%KEYP), %KEY
+ aesenclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ SET_TS_OR_POP_XMM0_XMM1(%r10)
+ ret
+ SET_SIZE(aes_encrypt_intel)
+
+
+/*
+ * aes_decrypt_intel()
+ * Decrypt a single block (in and out can overlap).
+ *
+ * For kernel code, caller is responsible for ensuring kpreempt_disable()
+ * has been called. This is because %xmm registers are not saved/restored.
+ * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set
+ * on entry. Otherwise, if TS is not set, save and restore %xmm registers
+ * on the stack.
+ *
+ * Temporary register usage:
+ * %xmm0 State
+ * %xmm1 Key
+ *
+ * Original OpenSolaris Interface:
+ * void aes_decrypt_intel(const aes_ks_t *ks, int Nr,
+ * const uint32_t pt[4], uint32_t ct[4])/
+ *
+ * Original Intel OpenSSL Interface:
+ * void intel_AES_decrypt(const unsigned char *in, unsigned char *out,
+ * const AES_KEY *key);
+ */
+ENTRY_NP(aes_decrypt_intel)
+ CLEAR_TS_OR_PUSH_XMM0_XMM1(%r10)
+
+ movups (%INP), %STATE // input
+ movaps (%KEYP), %KEY // key
+#ifdef OPENSSL_INTERFACE
+ mov 240(%KEYP), %NROUNDS32 // round count
+#else /* OpenSolaris Interface */
+ /* Round count is already present as P2 in %rsi/%esi */
+#endif /* OPENSSL_INTERFACE */
+
+ pxor %KEY, %STATE // round 0
+ lea 0x30(%KEYP), %KEYP
+ cmp $12, %NROUNDS
+ jb .Ldec128
+ lea 0x20(%KEYP), %KEYP
+ je .Ldec192
+
+ // AES 256
+ lea 0x20(%KEYP), %KEYP
+ movaps -0x60(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps -0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec192:
+ // AES 192 and 256
+ movaps -0x40(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps -0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+
+.align 4
+.Ldec128:
+ // AES 128, 192, and 256
+ movaps -0x20(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps -0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps (%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps 0x10(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps 0x20(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps 0x30(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps 0x40(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps 0x50(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps 0x60(%KEYP), %KEY
+ aesdec %KEY, %STATE
+ movaps 0x70(%KEYP), %KEY
+ aesdeclast %KEY, %STATE // last round
+ movups %STATE, (%OUTP) // output
+
+ SET_TS_OR_POP_XMM0_XMM1(%r10)
+ ret
+ SET_SIZE(aes_decrypt_intel)
+
+#endif /* lint || __lint */