diff options
Diffstat (limited to 'module/icp/asm-x86_64/modes')
6 files changed, 1821 insertions, 0 deletions
diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams new file mode 100644 index 000000000..0de1883dc --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams @@ -0,0 +1,36 @@ +Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain copyright notices, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the CRYPTOGAMS nor the names of its + copyright holder and contributors may be used to endorse or + promote products derived from this software without specific + prior written permission. + +ALTERNATIVELY, provided that this notice is retained in full, this +product may be distributed under the terms of the GNU General Public +License (GPL), in which case the provisions of the GPL apply INSTEAD OF +those given above. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip new file mode 100644 index 000000000..6184759c8 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip @@ -0,0 +1 @@ +PORTIONS OF GCM and GHASH FUNCTIONALITY diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl new file mode 100644 index 000000000..49cc83d2e --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl @@ -0,0 +1,177 @@ + + Apache License + Version 2.0, January 2004 + https://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip new file mode 100644 index 000000000..6184759c8 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip @@ -0,0 +1 @@ +PORTIONS OF GCM and GHASH FUNCTIONALITY diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S new file mode 100644 index 000000000..bad0b7d23 --- /dev/null +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -0,0 +1,892 @@ +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. +# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. 
who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, 0.74 - on +# Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled +# measurements for favourable packet size, one divisible by 96. +# Applications using the EVP interface will observe a few percent +# worse performance.] +# +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +# Generated once from +# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl +# and modified for ICP. Modification are kept at a bare minimum to ease later +# upstream merges. + +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE) + +.text + +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq 
$0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 
80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc 
%xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + cmpl $14,%ebp // ICP does not zero key schedule. + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 
32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + .byte 0xf3,0xc3 +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +.align 32 +aesni_gcm_decrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r9),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32+32(%r9),%r9 + movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. 
+ vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + leaq (%rdi),%r14 + vmovdqu 64(%rdi),%xmm4 + leaq -192(%rdi,%rdx,1),%r15 + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %r10,%r10 + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_dec_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
+ vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + .byte 0xf3,0xc3 +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor 
%xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $288,%rdx + jb .Lgcm_enc_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 504-128(%rcx),%ebp // ICP has an larger offset for rounds. + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + leaq (%rsi),%r14 + leaq -192(%rsi,%rdx,1),%r15 + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + vmovdqu (%r9),%xmm8 + leaq 32+32(%r9),%r9 + subq $12,%rdx + movq $192,%r10 + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + 
vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + 
vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor 
%xmm4,%xmm6,%xmm6
+	vpslldq $8,%xmm6,%xmm1
+	vmovdqu 16(%r11),%xmm3
+	vpsrldq $8,%xmm6,%xmm6
+	vpxor %xmm1,%xmm5,%xmm8
+	vpxor %xmm6,%xmm7,%xmm7
+
+	vpalignr $8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+	vpxor %xmm2,%xmm8,%xmm8
+
+	vpalignr $8,%xmm8,%xmm8,%xmm2
+	vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8
+	vpxor %xmm7,%xmm2,%xmm2
+	vpxor %xmm2,%xmm8,%xmm8
+	vpshufb (%r11),%xmm8,%xmm8
+	vmovdqu %xmm8,-64(%r9)
+
+	vzeroupper
+	movq -48(%rax),%r15
+.cfi_restore %r15
+	movq -40(%rax),%r14
+.cfi_restore %r14
+	movq -32(%rax),%r13
+.cfi_restore %r13
+	movq -24(%rax),%r12
+.cfi_restore %r12
+	movq -16(%rax),%rbp
+.cfi_restore %rbp
+	movq -8(%rax),%rbx
+.cfi_restore %rbx
+	leaq (%rax),%rsp
+.cfi_def_cfa_register %rsp
+.Lgcm_enc_abort:
+	movq %r10,%rax
+	.byte 0xf3,0xc3
+.cfi_endproc
+.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
+
+/* Some utility routines */
+
+/*
+ * Clear all AVX vector registers.
+ * void clear_fpu_regs_avx(void);
+ *
+ * The body is a single `vzeroall', which zeroes ymm0-ymm15 (and with
+ * them xmm0-xmm15). No general purpose register and no memory is
+ * touched.
+ */
+.globl clear_fpu_regs_avx
+.type clear_fpu_regs_avx,@function
+.align 32
+clear_fpu_regs_avx:
+	vzeroall
+	ret
+.size clear_fpu_regs_avx,.-clear_fpu_regs_avx
+
+/*
+ * void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
+ *
+ * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and
+ * stores the result at `dst'. The XOR is performed using FPU registers,
+ * so make sure FPU state is saved when running this in the kernel.
+ *
+ * In:       %rdi = src (16 bytes read), %rsi = dst (16 bytes read, then
+ *           overwritten)
+ * Clobbers: %xmm0, %xmm1
+ *
+ * NOTE: despite the _avx suffix the body uses the legacy SSE2 encodings
+ * (movdqu/pxor), not their VEX-encoded v* counterparts.
+ */
+.globl gcm_xor_avx
+.type gcm_xor_avx,@function
+.align 32
+gcm_xor_avx:
+	movdqu (%rdi), %xmm0		/* xmm0 = *src */
+	movdqu (%rsi), %xmm1		/* xmm1 = *dst */
+	pxor %xmm1, %xmm0		/* xmm0 = *src ^ *dst */
+	movdqu %xmm0, (%rsi)		/* *dst = xmm0 */
+	ret
+.size gcm_xor_avx,.-gcm_xor_avx
+
+/*
+ * Toggle a boolean_t value atomically and return the new value.
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+ *
+ * The 32-bit word at (%rdi) is XORed with 1 under a `lock' prefix. The
+ * return value is derived from ZF of that XOR rather than by re-reading
+ * memory: 0 if the resulting word is zero, 1 otherwise.
+ */
+.globl atomic_toggle_boolean_nv
+.type atomic_toggle_boolean_nv,@function
+.align 32
+atomic_toggle_boolean_nv:
+	xorl %eax, %eax			/* return value defaults to 0 */
+	lock
+	xorl $1, (%rdi)			/* atomic flip of bit 0; sets ZF */
+	jz 1f				/* new word is zero: return 0 */
+	movl $1, %eax			/* new word is non-zero: return 1 */
+1:
+	ret
+.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv
+
+/*
+ * Constant pool: five 16-byte entries laid out back to back, followed by
+ * a NUL-terminated ASCII identification string.
+ * NOTE(review): the visible tail of aesni_gcm_encrypt addresses (%r11)
+ * and 16(%r11); presumably %r11 points at .Lbswap_mask so that 16(%r11)
+ * is .Lpoly -- confirm in the part of the routine outside this chunk.
+ */
+.align 64
+.Lbswap_mask:		/* shuffle control that reverses all 16 bytes */
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.Lpoly:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.Lone_msb:
+.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+.Ltwo_lsb:
+.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+.Lone_lsb:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+/* ASCII: "AES-NI GCM module for x86_64, CRYPTOGAMS by <...>", NUL-terminated */
+.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */
diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S
new file mode 100644
index 000000000..90cc36b43
--- /dev/null
+++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S
@@ -0,0 +1,714 @@
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# ====================================================================
+# Written by Andy Polyakov <[email protected]> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ==================================================================== +# +# March, June 2010 +# +# The module implements "4-bit" GCM GHASH function and underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH +# function features so called "528B" variant utilizing additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% +# +# (*) comparison is not completely fair, because C results are +# for vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's mystery [to me] why Core2 result is not same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse for providing access to a +# Westmere-based system on behalf of Intel Open Source Technology Centre. + +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter. ghash-x86.pl discusses that it makes lesser sense to +# increase aggregate factor. Then why increase here? Critical path +# consists of 3 independent pclmulqdq instructions, Karatsuba post- +# processing and reduction. "On top" of this we lay down aggregated +# multiplication operations, triplets of independent pclmulqdq's. As +# issue rate for pclmulqdq is limited, it makes lesser sense to +# aggregate more multiplications than it takes to perform remaining +# non-multiplication operations. 
2x is near-optimal coefficient for +# contemporary Intel CPUs (therefore modest improvement coefficient), +# but not for Bulldozer. Latter is because logical SIMD operations +# are twice as slow in comparison to Intel, so that critical path is +# longer. A CPU with higher pclmulqdq issue rate would also benefit +# from higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) +# Goldmont 1.08(+24%) + +# March 2013 +# +# ... 8x aggregate factor AVX code path is using reduction algorithm +# suggested by Shay Gueron[1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to above mentioned version. But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs in 0.41 cycles per byte on Haswell processor, in +# 0.29 on Broadwell, and in 0.36 on Skylake. +# +# Knights Landing achieves 1.09 cpb. +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + +# Generated once from +# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl +# and modified for ICP. Modifications are kept at a bare minimum to ease later +# upstream merges. 
+
+#if defined(__x86_64__) && defined(HAVE_AVX) && \
+ defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.text
+
+/*
+ * gcm_gmult_clmul
+ *
+ * In:       %rdi = 16-byte block; read at entry, overwritten with the
+ *                  result at exit
+ *           %rsi = table; 16 bytes are read at offset 0 and 16 bytes at
+ *                  offset 32 (presumably the table written by
+ *                  gcm_init_htab_avx -- confirm against the caller)
+ * Clobbers: %xmm0-%xmm5
+ *
+ * The block is byte-reversed with .Lbswap_mask, multiplied by the entry
+ * at 0(%rsi) with three carry-less multiplications (lo*lo, hi*hi and a
+ * middle term formed from hi^lo and the low qword of 32(%rsi)), reduced
+ * with the shift/XOR sequences below, byte-reversed again and stored.
+ * SSE instructions that are emitted as raw .byte sequences are decoded in
+ * the trailing comments.
+ * .L_gmult_clmul is a second entry label; gcm_gmult_avx jumps to it.
+ */
+.globl gcm_gmult_clmul
+.type gcm_gmult_clmul,@function
+.align 16
+gcm_gmult_clmul:
+.cfi_startproc
+.L_gmult_clmul:
+	movdqu (%rdi),%xmm0
+	movdqa .Lbswap_mask(%rip),%xmm5
+	movdqu (%rsi),%xmm2
+	movdqu 32(%rsi),%xmm4
+.byte 102,15,56,0,197		// pshufb %xmm5,%xmm0
+	movdqa %xmm0,%xmm1
+	pshufd $78,%xmm0,%xmm3		// swap the two qwords
+	pxor %xmm0,%xmm3		// both qwords of xmm3 = hi ^ lo
+.byte 102,15,58,68,194,0	// pclmulqdq $0x00,%xmm2,%xmm0
+.byte 102,15,58,68,202,17	// pclmulqdq $0x11,%xmm2,%xmm1
+.byte 102,15,58,68,220,0	// pclmulqdq $0x00,%xmm4,%xmm3
+	pxor %xmm0,%xmm3
+	pxor %xmm1,%xmm3
+
+	// split the middle term across the low (xmm0) and high (xmm1) halves
+	movdqa %xmm3,%xmm4
+	psrldq $8,%xmm3
+	pslldq $8,%xmm4
+	pxor %xmm3,%xmm1
+	pxor %xmm4,%xmm0
+
+	// reduction, first phase (left shifts by 5, 1 and 57)
+	movdqa %xmm0,%xmm4
+	movdqa %xmm0,%xmm3
+	psllq $5,%xmm0
+	pxor %xmm0,%xmm3
+	psllq $1,%xmm0
+	pxor %xmm3,%xmm0
+	psllq $57,%xmm0
+	movdqa %xmm0,%xmm3
+	pslldq $8,%xmm0
+	psrldq $8,%xmm3
+	pxor %xmm4,%xmm0
+	pxor %xmm3,%xmm1
+
+
+	// reduction, second phase (right shifts by 1, 5 and 1)
+	movdqa %xmm0,%xmm4
+	psrlq $1,%xmm0
+	pxor %xmm4,%xmm1
+	pxor %xmm0,%xmm4
+	psrlq $5,%xmm0
+	pxor %xmm4,%xmm0
+	psrlq $1,%xmm0
+	pxor %xmm1,%xmm0
+.byte 102,15,56,0,197		// pshufb %xmm5,%xmm0
+	movdqu %xmm0,(%rdi)
+	.byte 0xf3,0xc3			// rep ret
+.cfi_endproc
+.size gcm_gmult_clmul,.-gcm_gmult_clmul
+
+/*
+ * gcm_init_htab_avx
+ *
+ * In:       %rdi = output table; the loop below runs four times and
+ *                  writes 48 bytes per pass (192 bytes in total)
+ *           %rsi = 16-byte hash key H
+ * Clobbers: %r10, %xmm0-%xmm6, flags
+ */
+.globl gcm_init_htab_avx
+.type gcm_init_htab_avx,@function
+.align 32
+gcm_init_htab_avx:
+.cfi_startproc
+	vzeroupper
+
+	vmovdqu (%rsi),%xmm2
+	// KCF/ICP stores H in network byte order with the hi qword first
+	// so we need to swap all bytes, not the 2 qwords.
+	vmovdqu .Lbswap_mask(%rip),%xmm4
+	vpshufb %xmm4,%xmm2,%xmm2
+
+
+	// 128-bit shift left by one bit: bit 63 of the low qword is carried
+	// into the high qword; xmm5 becomes an all-ones dword mask where the
+	// broadcast top dword of the original value is negative (top bit set).
+	vpshufd $255,%xmm2,%xmm4
+	vpsrlq $63,%xmm2,%xmm3
+	vpsllq $1,%xmm2,%xmm2
+	vpxor %xmm5,%xmm5,%xmm5
+	vpcmpgtd %xmm4,%xmm5,%xmm5
+	vpslldq $8,%xmm3,%xmm3
+	vpor %xmm3,%xmm2,%xmm2
+
+
+	// conditionally fold in the polynomial constant
+	vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
+	vpxor %xmm5,%xmm2,%xmm2
+
+	vpunpckhqdq %xmm2,%xmm2,%xmm6
+	vmovdqa %xmm2,%xmm0
+	vpxor %xmm2,%xmm6,%xmm6		// xmm6 = hi ^ lo of xmm2 (both qwords)
+	movq $4,%r10			// four passes, two 16-byte values each
+	jmp .Linit_start_avx
+.align 32
+.Linit_loop_avx:
+	// store the packed hi^lo entry of the previous pass at offset 32 of
+	// its 48-byte group (%rdi has already been advanced by 48)
+	vpalignr $8,%xmm3,%xmm4,%xmm5
+	vmovdqu %xmm5,-16(%rdi)
+	// xmm0 = xmm0 * xmm2: three carry-less multiplies + reduction
+	vpunpckhqdq %xmm0,%xmm0,%xmm3
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+	vpxor %xmm0,%xmm1,%xmm4
+	vpxor %xmm4,%xmm3,%xmm3
+
+	vpslldq $8,%xmm3,%xmm4
+	vpsrldq $8,%xmm3,%xmm3
+	vpxor %xmm4,%xmm0,%xmm0
+	vpxor %xmm3,%xmm1,%xmm1
+	vpsllq $57,%xmm0,%xmm3
+	vpsllq $62,%xmm0,%xmm4
+	vpxor %xmm3,%xmm4,%xmm4
+	vpsllq $63,%xmm0,%xmm3
+	vpxor %xmm3,%xmm4,%xmm4
+	vpslldq $8,%xmm4,%xmm3
+	vpsrldq $8,%xmm4,%xmm4
+	vpxor %xmm3,%xmm0,%xmm0
+	vpxor %xmm4,%xmm1,%xmm1
+
+	vpsrlq $1,%xmm0,%xmm4
+	vpxor %xmm0,%xmm1,%xmm1
+	vpxor %xmm4,%xmm0,%xmm0
+	vpsrlq $5,%xmm4,%xmm4
+	vpxor %xmm4,%xmm0,%xmm0
+	vpsrlq $1,%xmm0,%xmm0
+	vpxor %xmm1,%xmm0,%xmm0
+.Linit_start_avx:
+	vmovdqa %xmm0,%xmm5		// keep the value before the multiply
+	// xmm0 = xmm0 * xmm2 again (same sequence as above)
+	vpunpckhqdq %xmm0,%xmm0,%xmm3
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
+	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
+	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
+	vpxor %xmm0,%xmm1,%xmm4
+	vpxor %xmm4,%xmm3,%xmm3
+
+	vpslldq $8,%xmm3,%xmm4
+	vpsrldq $8,%xmm3,%xmm3
+	vpxor %xmm4,%xmm0,%xmm0
+	vpxor %xmm3,%xmm1,%xmm1
+	vpsllq $57,%xmm0,%xmm3
+	vpsllq $62,%xmm0,%xmm4
+	vpxor %xmm3,%xmm4,%xmm4
+	vpsllq $63,%xmm0,%xmm3
+	vpxor %xmm3,%xmm4,%xmm4
+	vpslldq $8,%xmm4,%xmm3
+	vpsrldq $8,%xmm4,%xmm4
+	vpxor %xmm3,%xmm0,%xmm0
+	vpxor %xmm4,%xmm1,%xmm1
+
+	vpsrlq $1,%xmm0,%xmm4
+	vpxor %xmm0,%xmm1,%xmm1
+	vpxor %xmm4,%xmm0,%xmm0
+	vpsrlq $5,%xmm4,%xmm4
+	vpxor %xmm4,%xmm0,%xmm0
+	vpsrlq $1,%xmm0,%xmm0
+	vpxor %xmm1,%xmm0,%xmm0
+	vpshufd $78,%xmm5,%xmm3
+	vpshufd $78,%xmm0,%xmm4
+	vpxor %xmm5,%xmm3,%xmm3		// hi ^ lo of the value stored at +0
+	vmovdqu %xmm5,0(%rdi)
+	vpxor %xmm0,%xmm4,%xmm4		// hi ^ lo of the value stored at +16
+	vmovdqu %xmm0,16(%rdi)
+	leaq 48(%rdi),%rdi
+	subq $1,%r10
+	jnz .Linit_loop_avx
+
+	// Packed entry of the final pass. The two source operands are in the
+	// opposite order compared with the vpalignr inside the loop.
+	vpalignr $8,%xmm4,%xmm3,%xmm5
+	vmovdqu %xmm5,-16(%rdi)
+
+	vzeroupper
+	.byte 0xf3,0xc3			// rep ret
+.cfi_endproc
+.size gcm_init_htab_avx,.-gcm_init_htab_avx
+
+/*
+ * gcm_gmult_avx
+ *
+ * Tail-jumps to .L_gmult_clmul, the entry label inside gcm_gmult_clmul,
+ * so arguments, result and clobbers are those of gcm_gmult_clmul.
+ */
+.globl gcm_gmult_avx
+.type gcm_gmult_avx,@function
+.align 32
+gcm_gmult_avx:
+.cfi_startproc
+	jmp .L_gmult_clmul
+.cfi_endproc
+.size gcm_gmult_avx,.-gcm_gmult_avx
+/*
+ * gcm_ghash_avx
+ *
+ * In:  %rdi = 16-byte hash state; loaded at entry, stored at exit
+ *      %rsi = table, biased by +64 below and then addressed as
+ *             N-64(%rsi) (presumably the table built by
+ *             gcm_init_htab_avx -- confirm against the caller)
+ *      %rdx = input data
+ *      %rcx = input length in bytes; the code consumes it in steps of
+ *             0x80 and 0x10 (presumably a non-zero multiple of 16 --
+ *             TODO confirm)
+ * Uses %r10 as a pointer to .L0x1c2_polynomial, %xmm13 as the byte-swap
+ * mask and %xmm10 as the running hash value; %xmm0-%xmm15 are clobbered.
+ */
+.globl gcm_ghash_avx
+.type gcm_ghash_avx,@function
+.align 32
+gcm_ghash_avx:
+.cfi_startproc
+	vzeroupper
+
+	vmovdqu (%rdi),%xmm10
+	leaq .L0x1c2_polynomial(%rip),%r10
+	leaq 64(%rsi),%rsi
+	vmovdqu .Lbswap_mask(%rip),%xmm13
+	vpshufb %xmm13,%xmm10,%xmm10
+	cmpq $0x80,%rcx
+	jb .Lshort_avx			// fewer than 128 bytes in total
+	subq $0x80,%rcx
+
+	// First 128 bytes: blocks are read from offset 112 down to 0 while
+	// the table is read at increasing offsets.
+	vmovdqu 112(%rdx),%xmm14
+	vmovdqu 0-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm14
+	vmovdqu 32-64(%rsi),%xmm7
+
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vmovdqu 96(%rdx),%xmm15
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpxor %xmm14,%xmm9,%xmm9
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu 16-64(%rsi),%xmm6
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vmovdqu 80(%rdx),%xmm14
+	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+	vpxor %xmm15,%xmm8,%xmm8
+
+	vpshufb %xmm13,%xmm14,%xmm14
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu 48-64(%rsi),%xmm6
+	vpxor %xmm14,%xmm9,%xmm9
+	vmovdqu 64(%rdx),%xmm15
+	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu 80-64(%rsi),%xmm7
+
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpxor %xmm1,%xmm4,%xmm4
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu 64-64(%rsi),%xmm6
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+	vpxor %xmm15,%xmm8,%xmm8
+
+	vmovdqu 48(%rdx),%xmm14
+	vpxor
%xmm3,%xmm0,%xmm0
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+	vpxor %xmm4,%xmm1,%xmm1
+	vpshufb %xmm13,%xmm14,%xmm14
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu 96-64(%rsi),%xmm6
+	vpxor %xmm5,%xmm2,%xmm2
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu 128-64(%rsi),%xmm7
+	vpxor %xmm14,%xmm9,%xmm9
+
+	vmovdqu 32(%rdx),%xmm15
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpxor %xmm1,%xmm4,%xmm4
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu 112-64(%rsi),%xmm6
+	vpxor %xmm2,%xmm5,%xmm5
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+	vpxor %xmm15,%xmm8,%xmm8
+
+	vmovdqu 16(%rdx),%xmm14
+	vpxor %xmm3,%xmm0,%xmm0
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+	vpxor %xmm4,%xmm1,%xmm1
+	vpshufb %xmm13,%xmm14,%xmm14
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu 144-64(%rsi),%xmm6
+	vpxor %xmm5,%xmm2,%xmm2
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu 176-64(%rsi),%xmm7
+	vpxor %xmm14,%xmm9,%xmm9
+
+	vmovdqu (%rdx),%xmm15
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpxor %xmm1,%xmm4,%xmm4
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu 160-64(%rsi),%xmm6
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+
+	// %xmm15 now holds the byte-swapped block from offset 0; it has not
+	// been multiplied yet. Either the loop or the tail finishes it.
+	leaq 128(%rdx),%rdx
+	cmpq $0x80,%rcx
+	jb .Ltail_avx			// no further full 128-byte chunk
+
+	vpxor %xmm10,%xmm15,%xmm15	// fold the running hash into block 0
+	subq $0x80,%rcx
+	jmp .Loop8x_avx
+
+	// Main loop, 128 bytes per iteration: finishes the pending block in
+	// %xmm15 into %xmm10/%xmm11/%xmm12, reduces that result with two
+	// multiplications by the constant at (%r10), and in parallel starts
+	// the multiplications for the next 128 bytes at (%rdx).
+.align 32
+.Loop8x_avx:
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vmovdqu 112(%rdx),%xmm14
+	vpxor %xmm0,%xmm3,%xmm3
+	vpxor %xmm15,%xmm8,%xmm8
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
+	vpshufb %xmm13,%xmm14,%xmm14
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
+	vmovdqu 0-64(%rsi),%xmm6
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
+	vmovdqu 32-64(%rsi),%xmm7
+	vpxor %xmm14,%xmm9,%xmm9
+
+	vmovdqu 96(%rdx),%xmm15
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpxor %xmm3,%xmm10,%xmm10
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vxorps %xmm4,%xmm11,%xmm11
+	vmovdqu 16-64(%rsi),%xmm6
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+	vpxor %xmm5,%xmm12,%xmm12
+	vxorps %xmm15,%xmm8,%xmm8
+
+	vmovdqu 80(%rdx),%xmm14
+	vpxor %xmm10,%xmm12,%xmm12
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+	vpxor %xmm11,%xmm12,%xmm12
+	vpslldq $8,%xmm12,%xmm9
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+	vpsrldq $8,%xmm12,%xmm12
+	vpxor %xmm9,%xmm10,%xmm10
+	vmovdqu 48-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm14
+	vxorps %xmm12,%xmm11,%xmm11
+	vpxor %xmm1,%xmm4,%xmm4
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu 80-64(%rsi),%xmm7
+	vpxor %xmm14,%xmm9,%xmm9
+	vpxor %xmm2,%xmm5,%xmm5
+
+	vmovdqu 64(%rdx),%xmm15
+	vpalignr $8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpxor %xmm3,%xmm0,%xmm0
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu 64-64(%rsi),%xmm6
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm4,%xmm1,%xmm1
+	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+	vxorps %xmm15,%xmm8,%xmm8
+	vpxor %xmm5,%xmm2,%xmm2
+
+	vmovdqu 48(%rdx),%xmm14
+	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+	vpshufb %xmm13,%xmm14,%xmm14
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu 96-64(%rsi),%xmm6
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu 128-64(%rsi),%xmm7
+	vpxor %xmm14,%xmm9,%xmm9
+	vpxor %xmm2,%xmm5,%xmm5
+
+	vmovdqu 32(%rdx),%xmm15
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpxor %xmm3,%xmm0,%xmm0
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu 112-64(%rsi),%xmm6
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm4,%xmm1,%xmm1
+	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
+	vpxor %xmm15,%xmm8,%xmm8
+	vpxor %xmm5,%xmm2,%xmm2
+	vxorps %xmm12,%xmm10,%xmm10
+
+	vmovdqu 16(%rdx),%xmm14
+	vpalignr $8,%xmm10,%xmm10,%xmm12
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
+	vpshufb %xmm13,%xmm14,%xmm14
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
+	vmovdqu 144-64(%rsi),%xmm6
+	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
+	vxorps %xmm11,%xmm12,%xmm12
+	vpunpckhqdq %xmm14,%xmm14,%xmm9
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
+	vmovdqu 176-64(%rsi),%xmm7
+	vpxor %xmm14,%xmm9,%xmm9
+	vpxor %xmm2,%xmm5,%xmm5
+
+	vmovdqu (%rdx),%xmm15
+	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
+	vpshufb %xmm13,%xmm15,%xmm15
+	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
+	vmovdqu 160-64(%rsi),%xmm6
+	vpxor %xmm12,%xmm15,%xmm15	// fold the reduced hash into the
+	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
+	vpxor %xmm10,%xmm15,%xmm15	// new pending block (two XORs)
+
+	leaq 128(%rdx),%rdx
+	subq $0x80,%rcx
+	jnc .Loop8x_avx			// loop while the subtraction did not borrow
+
+	addq $0x80,%rcx			// undo the borrow: %rcx = bytes left (< 0x80)
+	jmp .Ltail_no_xor_avx		// hash already folded into %xmm15 above
+
+	// Fewer than 0x80 bytes to go. %rdx is moved to the end of the input
+	// and the 16-byte blocks are read backwards (-16, -32, ... -112),
+	// each paired with the next table entry. The three vmovdqa copies make
+	// the first vpxor of each accumulator pair produce zero (x ^ x).
+	// Every stanza ends by testing whether that was the last block.
+.align 32
+.Lshort_avx:
+	vmovdqu -16(%rdx,%rcx,1),%xmm14
+	leaq (%rdx,%rcx,1),%rdx
+	vmovdqu 0-64(%rsi),%xmm6
+	vmovdqu 32-64(%rsi),%xmm7
+	vpshufb %xmm13,%xmm14,%xmm15
+
+	vmovdqa %xmm0,%xmm3
+	vmovdqa %xmm1,%xmm4
+	vmovdqa %xmm2,%xmm5
+	subq $0x10,%rcx
+	jz .Ltail_avx
+
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+	vpxor %xmm15,%xmm8,%xmm8
+	vmovdqu -32(%rdx),%xmm14
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu 16-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm15
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq $8,%xmm7,%xmm7		// bring the high qword of the packed entry down
+	subq $0x10,%rcx
+	jz .Ltail_avx
+
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+	vpxor %xmm15,%xmm8,%xmm8
+	vmovdqu -48(%rdx),%xmm14
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu 48-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm15
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu 80-64(%rsi),%xmm7
+	subq $0x10,%rcx
+	jz .Ltail_avx
+
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+	vpxor %xmm15,%xmm8,%xmm8
+	vmovdqu -64(%rdx),%xmm14
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu 64-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm15
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq $8,%xmm7,%xmm7
+	subq $0x10,%rcx
+	jz .Ltail_avx
+
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+	vpxor %xmm15,%xmm8,%xmm8
+	vmovdqu -80(%rdx),%xmm14
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu 96-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm15
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+	vmovdqu 128-64(%rsi),%xmm7
+	subq $0x10,%rcx
+	jz .Ltail_avx
+
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+	vpxor %xmm15,%xmm8,%xmm8
+	vmovdqu -96(%rdx),%xmm14
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu 112-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm15
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+	vpsrldq $8,%xmm7,%xmm7
+	subq $0x10,%rcx
+	jz .Ltail_avx
+
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+	vpxor %xmm15,%xmm8,%xmm8
+	vmovdqu -112(%rdx),%xmm14
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+	vmovdqu 144-64(%rsi),%xmm6
+	vpshufb %xmm13,%xmm14,%xmm15
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+	vmovq 184-64(%rsi),%xmm7	// 8-byte load: upper half of the entry at 176-64
+	subq $0x10,%rcx
+	jmp .Ltail_avx
+
+	// Final multiply and reduction. .Ltail_avx first folds the running
+	// hash (%xmm10) into the pending block; .Ltail_no_xor_avx is entered
+	// from the 8x loop, where that XOR has already been done.
+.align 32
+.Ltail_avx:
+	vpxor %xmm10,%xmm15,%xmm15
+.Ltail_no_xor_avx:
+	vpunpckhqdq %xmm15,%xmm15,%xmm8
+	vpxor %xmm0,%xmm3,%xmm3
+	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
+	vpxor %xmm15,%xmm8,%xmm8
+	vpxor %xmm1,%xmm4,%xmm4
+	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
+	vpxor %xmm2,%xmm5,%xmm5
+	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
+
+	vmovdqu (%r10),%xmm12		// .L0x1c2_polynomial
+
+	vpxor %xmm0,%xmm3,%xmm10	// accumulated low products
+	vpxor %xmm1,%xmm4,%xmm11	// accumulated high products
+	vpxor %xmm2,%xmm5,%xmm5		// accumulated middle products
+
+	vpxor %xmm10,%xmm5,%xmm5
+	vpxor %xmm11,%xmm5,%xmm5
+	vpslldq $8,%xmm5,%xmm9
+	vpsrldq $8,%xmm5,%xmm5
+	vpxor %xmm9,%xmm10,%xmm10
+	vpxor %xmm5,%xmm11,%xmm11
+
+	// two-step reduction by the polynomial constant
+	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+	vpalignr $8,%xmm10,%xmm10,%xmm10
+	vpxor %xmm9,%xmm10,%xmm10
+
+	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
+	vpalignr $8,%xmm10,%xmm10,%xmm10
+	vpxor %xmm11,%xmm10,%xmm10
+	vpxor %xmm9,%xmm10,%xmm10
+
+	cmpq $0,%rcx
+	jne .Lshort_avx			// bytes left after the 8x loop
+
+	vpshufb %xmm13,%xmm10,%xmm10	// back to the in-memory byte order
+	vmovdqu %xmm10,(%rdi)
+	vzeroupper
+	.byte 0xf3,0xc3			// rep ret
+.cfi_endproc
+.size gcm_ghash_avx,.-gcm_ghash_avx
+// Constants. .Lbswap_mask (reverses all 16 bytes when used as a shuffle
+// control) and .L0x1c2_polynomial are used by the routines above. None of
+// the routines in this chunk references .L7_mask, .L7_mask_poly,
+// .Lrem_4bit or .Lrem_8bit.
+.align 64
+.Lbswap_mask:
+.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+.L0x1c2_polynomial:
+.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+.L7_mask:
+.long 7,0,7,0
+.L7_mask_poly:
+.long 7,0,450,0
+.align 64
+.type .Lrem_4bit,@object
+.Lrem_4bit:
+.long 0,0,0,471859200,0,943718400,0,610271232
+.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208
+.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008
+.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160
+.type .Lrem_8bit,@object
+.Lrem_8bit:
+.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+.value
0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+/* ASCII: "GHASH for x86_64, CRYPTOGAMS by <...>", NUL-terminated */
+.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 64
+
+/* Mark the stack non-executable. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ |