aboutsummaryrefslogtreecommitdiffstats
path: root/module/icp/asm-aarch64
diff options
context:
space:
mode:
author Tino Reichardt <[email protected]> 2023-04-26 21:40:26 +0200
committer GitHub <[email protected]> 2023-04-26 12:40:26 -0700
commit ee728008a4279dbbbe5332f8b9a886f3b8d91e00 (patch)
tree 545341a79b94509b9b027238c8d90a3c3a6dc560 /module/icp/asm-aarch64
parent b5411618f727c4ce5f787bb97d1c87f20c66027a (diff)
Fix BLAKE3 aarch64 assembly for FreeBSD and macOS
The x18 register isn't usable within FreeBSD kernel space, so the BLAKE3 aarch64 assembly has to be changed so that it no longer uses it.

The source files are here: https://github.com/mcmilk/BLAKE3-tests

Reviewed-by: Kyle Evans <[email protected]>
Signed-off-by: Tino Reichardt <[email protected]>
Closes #14728
Diffstat (limited to 'module/icp/asm-aarch64')
-rw-r--r-- module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S 4163
-rw-r--r-- module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S 4405
2 files changed, 4057 insertions, 4511 deletions
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
index 8237f0eb5..dc2719d14 100644
--- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
@@ -22,480 +22,61 @@
/*
* Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
* Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale
- * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * Copyright (c) 2022-2023 Tino Reichardt <[email protected]>
*
* This is converted assembly: SSE2 -> ARMv8-A
* Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
*/
#if defined(__aarch64__)
.text
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4
-.LCPI0_0:
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
-.LCPI0_1:
- .xword 0
- .xword -4294967296
-.LCPI0_2:
- .xword -1
- .xword 4294967295
+ .section .note.gnu.property,"a",@note
+ .p2align 3
+ .word 4
+ .word 16
+ .word 5
+ .asciz "GNU"
+ .word 3221225472
+ .word 4
+ .word 3
+ .word 0
+.Lsec_end0:
.text
.globl zfs_blake3_compress_in_place_sse2
.p2align 2
.type zfs_blake3_compress_in_place_sse2,@function
zfs_blake3_compress_in_place_sse2:
.cfi_startproc
- ldp q3, q2, [x0]
- ldp q5, q6, [x1]
- add x10, x1, #32
- lsr x11, x3, #32
- fmov s4, w3
- ld2 { v17.4s, v18.4s }, [x10]
- adrp x10, .LCPI0_2
- and w8, w2, #0xff
- mov v4.s[1], w11
- ldr q1, [x10, :lo12:.LCPI0_2]
- and w9, w4, #0xff
- adrp x12, .LCPI0_0
- mov v4.s[2], w8
- uzp1 v19.4s, v5.4s, v6.4s
- add v3.4s, v2.4s, v3.4s
- ldr q7, [x12, :lo12:.LCPI0_0]
- mov v4.s[3], w9
- add v3.4s, v3.4s, v19.4s
- uzp2 v5.4s, v5.4s, v6.4s
- ext v21.16b, v18.16b, v18.16b, #12
- uzp1 v6.4s, v19.4s, v19.4s
- ext v22.16b, v19.16b, v19.16b, #12
- eor v4.16b, v3.16b, v4.16b
- ext v20.16b, v17.16b, v17.16b, #12
- ext v6.16b, v6.16b, v19.16b, #8
- ext v19.16b, v19.16b, v22.16b, #12
- zip1 v22.2d, v21.2d, v5.2d
- rev32 v24.8h, v4.8h
- mov v4.16b, v1.16b
- zip2 v23.4s, v5.4s, v21.4s
- uzp2 v6.4s, v6.4s, v5.4s
- bsl v4.16b, v22.16b, v20.16b
- add v3.4s, v3.4s, v5.4s
- zip1 v5.4s, v23.4s, v20.4s
- zip1 v22.4s, v20.4s, v23.4s
- add v23.4s, v24.4s, v7.4s
- ext v7.16b, v6.16b, v6.16b, #4
- ext v25.16b, v4.16b, v4.16b, #12
- ext v5.16b, v22.16b, v5.16b, #8
- eor v2.16b, v23.16b, v2.16b
- uzp1 v4.4s, v4.4s, v25.4s
- uzp1 v22.4s, v7.4s, v7.4s
- ext v25.16b, v7.16b, v7.16b, #12
- ext v22.16b, v22.16b, v7.16b, #8
- ext v7.16b, v7.16b, v25.16b, #12
- ushr v25.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- orr v2.16b, v2.16b, v25.16b
- add v3.4s, v3.4s, v2.4s
- eor v24.16b, v3.16b, v24.16b
- add v3.4s, v3.4s, v17.4s
- ushr v17.4s, v24.4s, #8
- shl v18.4s, v24.4s, #24
- orr v17.16b, v18.16b, v17.16b
- add v18.4s, v17.4s, v23.4s
- eor v2.16b, v18.16b, v2.16b
- ushr v23.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- ext v3.16b, v3.16b, v3.16b, #12
- orr v2.16b, v2.16b, v23.16b
- ext v17.16b, v17.16b, v17.16b, #8
- add v3.4s, v2.4s, v3.4s
- adrp x11, .LCPI0_1
- eor v17.16b, v3.16b, v17.16b
- ldr q16, [x11, :lo12:.LCPI0_1]
- ext v18.16b, v18.16b, v18.16b, #4
- rev32 v24.8h, v17.8h
- movi v0.2d, #0xffffffff00000000
- add v23.4s, v3.4s, v21.4s
- mov v21.s[1], v20.s[2]
- add v20.4s, v18.4s, v24.4s
- bit v19.16b, v21.16b, v0.16b
- eor v3.16b, v20.16b, v2.16b
- uzp2 v2.4s, v22.4s, v19.4s
- zip1 v17.2d, v5.2d, v19.2d
- zip2 v18.4s, v19.4s, v5.4s
- ushr v21.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- ext v22.16b, v2.16b, v2.16b, #4
- bsl v16.16b, v4.16b, v17.16b
- zip1 v17.4s, v18.4s, v4.4s
- zip1 v18.4s, v4.4s, v18.4s
- orr v21.16b, v3.16b, v21.16b
- ext v25.16b, v16.16b, v16.16b, #12
- ext v3.16b, v18.16b, v17.16b, #8
- uzp1 v18.4s, v22.4s, v22.4s
- ext v26.16b, v22.16b, v22.16b, #12
- add v23.4s, v23.4s, v21.4s
- uzp1 v17.4s, v16.4s, v25.4s
- ext v16.16b, v18.16b, v22.16b, #8
- ext v18.16b, v22.16b, v26.16b, #12
- eor v22.16b, v23.16b, v24.16b
- add v6.4s, v23.4s, v6.4s
- ushr v23.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v23.16b
- add v20.4s, v22.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v23.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v21.16b, v21.16b, v23.16b
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v21.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v20.16b, v20.16b, v20.16b, #12
- add v6.4s, v6.4s, v19.4s
- rev32 v19.8h, v22.8h
- add v20.4s, v20.4s, v19.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v22.4s, v21.4s, #12
- shl v21.4s, v21.4s, #20
- orr v21.16b, v21.16b, v22.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ushr v22.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v22.16b
- add v20.4s, v19.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #12
- ushr v22.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- add v6.4s, v6.4s, v4.4s
- orr v21.16b, v21.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ext v20.16b, v20.16b, v20.16b, #4
- rev32 v19.8h, v19.8h
- add v20.4s, v20.4s, v19.4s
- add v6.4s, v6.4s, v5.4s
- mov v5.s[1], v4.s[2]
- eor v4.16b, v20.16b, v21.16b
- ushr v21.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- orr v21.16b, v4.16b, v21.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- add v2.4s, v6.4s, v2.4s
- ushr v6.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v6.16b, v19.16b, v6.16b
- add v19.4s, v6.4s, v20.4s
- eor v20.16b, v19.16b, v21.16b
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v20.4s, v2.4s
- eor v6.16b, v2.16b, v6.16b
- ext v19.16b, v19.16b, v19.16b, #12
- rev32 v6.8h, v6.8h
- add v19.4s, v19.4s, v6.4s
- mov v22.16b, v0.16b
- eor v20.16b, v19.16b, v20.16b
- bsl v22.16b, v5.16b, v7.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- add v2.4s, v2.4s, v22.4s
- orr v20.16b, v20.16b, v21.16b
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- ushr v21.4s, v6.4s, #8
- shl v6.4s, v6.4s, #24
- orr v6.16b, v6.16b, v21.16b
- add v19.4s, v6.4s, v19.4s
- eor v20.16b, v19.16b, v20.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v2.4s, v2.4s, v17.4s
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- uzp2 v5.4s, v16.4s, v22.4s
- zip1 v7.2d, v3.2d, v22.2d
- zip2 v16.4s, v22.4s, v3.4s
- ext v19.16b, v19.16b, v19.16b, #4
- rev32 v22.8h, v6.8h
- ext v23.16b, v5.16b, v5.16b, #4
- bif v7.16b, v17.16b, v1.16b
- zip1 v24.4s, v16.4s, v17.4s
- zip1 v16.4s, v17.4s, v16.4s
- add v21.4s, v2.4s, v3.4s
- mov v3.s[1], v17.s[2]
- add v17.4s, v19.4s, v22.4s
- mov v19.16b, v0.16b
- ext v25.16b, v7.16b, v7.16b, #12
- ext v4.16b, v16.16b, v24.16b, #8
- uzp1 v16.4s, v23.4s, v23.4s
- bsl v19.16b, v3.16b, v18.16b
- eor v2.16b, v17.16b, v20.16b
- uzp1 v7.4s, v7.4s, v25.4s
- ext v25.16b, v16.16b, v23.16b, #8
- zip1 v3.2d, v4.2d, v19.2d
- ushr v20.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- ext v24.16b, v23.16b, v23.16b, #12
- uzp2 v6.4s, v25.4s, v19.4s
- zip2 v18.4s, v19.4s, v4.4s
- bif v3.16b, v7.16b, v1.16b
- orr v20.16b, v2.16b, v20.16b
- ext v16.16b, v23.16b, v24.16b, #12
- ext v23.16b, v6.16b, v6.16b, #4
- zip1 v24.4s, v18.4s, v7.4s
- zip1 v18.4s, v7.4s, v18.4s
- ext v25.16b, v3.16b, v3.16b, #12
- add v21.4s, v21.4s, v20.4s
- ext v2.16b, v18.16b, v24.16b, #8
- uzp1 v18.4s, v23.4s, v23.4s
- ext v24.16b, v23.16b, v23.16b, #12
- uzp1 v3.4s, v3.4s, v25.4s
- eor v22.16b, v21.16b, v22.16b
- ext v25.16b, v18.16b, v23.16b, #8
- dup v18.4s, v2.s[3]
- ext v23.16b, v23.16b, v24.16b, #12
- add v5.4s, v21.4s, v5.4s
- trn1 v21.4s, v3.4s, v3.4s
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- ext v18.16b, v21.16b, v18.16b, #8
- orr v21.16b, v22.16b, v24.16b
- add v17.4s, v21.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v22.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v20.16b, v20.16b, v22.16b
- ext v21.16b, v21.16b, v21.16b, #8
- add v5.4s, v20.4s, v5.4s
- eor v21.16b, v5.16b, v21.16b
- ext v17.16b, v17.16b, v17.16b, #12
- add v5.4s, v5.4s, v19.4s
- rev32 v19.8h, v21.8h
- add v17.4s, v17.4s, v19.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v21.16b
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ushr v21.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v21.16b
- add v17.4s, v19.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ext v5.16b, v5.16b, v5.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v5.4s, v5.4s, v7.4s
- orr v20.16b, v20.16b, v21.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ext v17.16b, v17.16b, v17.16b, #4
- rev32 v22.8h, v19.8h
- add v21.4s, v5.4s, v4.4s
- mov v4.s[1], v7.s[2]
- add v19.4s, v17.4s, v22.4s
- bit v16.16b, v4.16b, v0.16b
- eor v5.16b, v19.16b, v20.16b
- uzp2 v4.4s, v25.4s, v16.4s
- zip1 v7.2d, v2.2d, v16.2d
- zip2 v17.4s, v16.4s, v2.4s
- ushr v20.4s, v5.4s, #12
- shl v5.4s, v5.4s, #20
- ext v24.16b, v4.16b, v4.16b, #4
- bif v7.16b, v3.16b, v1.16b
- zip1 v25.4s, v17.4s, v3.4s
- zip1 v17.4s, v3.4s, v17.4s
- orr v20.16b, v5.16b, v20.16b
- ext v26.16b, v7.16b, v7.16b, #12
- ext v5.16b, v17.16b, v25.16b, #8
- uzp1 v17.4s, v24.4s, v24.4s
- ext v25.16b, v24.16b, v24.16b, #12
- bit v23.16b, v18.16b, v0.16b
- add v21.4s, v21.4s, v20.4s
- uzp1 v7.4s, v7.4s, v26.4s
- ext v26.16b, v17.16b, v24.16b, #8
- ext v17.16b, v24.16b, v25.16b, #12
- eor v22.16b, v21.16b, v22.16b
- add v6.4s, v21.4s, v6.4s
- zip1 v21.2d, v5.2d, v23.2d
- zip2 v24.4s, v23.4s, v5.4s
- bif v21.16b, v7.16b, v1.16b
- zip1 v1.4s, v24.4s, v7.4s
- zip1 v24.4s, v7.4s, v24.4s
- ext v1.16b, v24.16b, v1.16b, #8
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v24.16b
- add v19.4s, v22.4s, v19.4s
- ext v24.16b, v21.16b, v21.16b, #12
- eor v20.16b, v19.16b, v20.16b
- uzp1 v21.4s, v21.4s, v24.4s
- ushr v24.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- orr v20.16b, v20.16b, v24.16b
- ext v6.16b, v6.16b, v6.16b, #4
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v20.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #12
- add v6.4s, v6.4s, v16.4s
- rev32 v16.8h, v22.8h
- add v19.4s, v19.4s, v16.4s
- eor v20.16b, v19.16b, v20.16b
- ushr v22.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v22.16b
- add v6.4s, v6.4s, v20.4s
- eor v16.16b, v6.16b, v16.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v6.4s, v3.4s
- ushr v6.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- orr v6.16b, v16.16b, v6.16b
- add v16.4s, v6.4s, v19.4s
- eor v19.16b, v16.16b, v20.16b
- ushr v20.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v20.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v3.4s, v3.4s, v19.4s
- eor v6.16b, v3.16b, v6.16b
- ext v16.16b, v16.16b, v16.16b, #4
- add v2.4s, v3.4s, v2.4s
- rev32 v3.8h, v6.8h
- add v6.4s, v16.4s, v3.4s
- eor v16.16b, v6.16b, v19.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- orr v16.16b, v16.16b, v19.16b
- add v2.4s, v2.4s, v16.4s
- eor v3.16b, v2.16b, v3.16b
- add v2.4s, v2.4s, v4.4s
- ushr v4.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v4.16b
- add v4.4s, v3.4s, v6.4s
- eor v6.16b, v4.16b, v16.16b
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v6.16b, v6.16b, v16.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v6.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #12
- rev32 v3.8h, v3.8h
- add v4.4s, v4.4s, v3.4s
- eor v6.16b, v4.16b, v6.16b
- ushr v16.4s, v6.4s, #12
- shl v6.4s, v6.4s, #20
- add v2.4s, v2.4s, v23.4s
- orr v6.16b, v6.16b, v16.16b
- add v2.4s, v2.4s, v6.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v16.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v16.16b
- add v4.4s, v3.4s, v4.4s
- eor v6.16b, v4.16b, v6.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- add v2.4s, v2.4s, v7.4s
- orr v6.16b, v6.16b, v16.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v2.4s, v6.4s
- eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #4
- rev32 v3.8h, v3.8h
- add v2.4s, v2.4s, v5.4s
- mov v5.s[1], v7.s[2]
- add v4.4s, v4.4s, v3.4s
- bsl v0.16b, v5.16b, v17.16b
- eor v5.16b, v4.16b, v6.16b
- ushr v6.4s, v5.4s, #12
- shl v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v6.16b
- add v2.4s, v2.4s, v5.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v6.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v6.16b
- add v4.4s, v3.4s, v4.4s
- uzp2 v18.4s, v26.4s, v18.4s
- eor v5.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v18.4s
- ushr v6.4s, v5.4s, #7
- shl v5.4s, v5.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v5.16b, v5.16b, v6.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v5.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #12
- add v0.4s, v2.4s, v0.4s
- rev32 v2.8h, v3.8h
- add v3.4s, v4.4s, v2.4s
- eor v4.16b, v3.16b, v5.16b
- ushr v5.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- orr v4.16b, v4.16b, v5.16b
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ushr v5.4s, v2.4s, #8
- shl v2.4s, v2.4s, #24
- orr v2.16b, v2.16b, v5.16b
- add v3.4s, v2.4s, v3.4s
- eor v4.16b, v3.16b, v4.16b
- ext v0.16b, v0.16b, v0.16b, #12
- ushr v5.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- add v0.4s, v0.4s, v21.4s
- orr v4.16b, v4.16b, v5.16b
- ext v2.16b, v2.16b, v2.16b, #8
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ext v3.16b, v3.16b, v3.16b, #4
- add v0.4s, v0.4s, v1.4s
- rev32 v1.8h, v2.8h
- add v2.4s, v3.4s, v1.4s
- eor v3.16b, v2.16b, v4.16b
- ushr v4.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- orr v3.16b, v3.16b, v4.16b
- add v0.4s, v0.4s, v3.4s
- eor v1.16b, v0.16b, v1.16b
- ushr v4.4s, v1.4s, #8
- shl v1.4s, v1.4s, #24
- orr v1.16b, v1.16b, v4.16b
- add v2.4s, v1.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ext v0.16b, v0.16b, v0.16b, #4
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v4.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- ext v1.16b, v1.16b, v1.16b, #8
+ hint #25
+ .cfi_negate_ra_state
+ sub sp, sp, #96
+ stp x29, x30, [sp, #64]
+ add x29, sp, #64
+ str x19, [sp, #80]
+ .cfi_def_cfa w29, 32
+ .cfi_offset w19, -16
+ .cfi_offset w30, -24
+ .cfi_offset w29, -32
+ mov x19, x0
+ mov w5, w4
+ mov x4, x3
+ mov w3, w2
+ mov x2, x1
+ mov x0, sp
+ mov x1, x19
+ bl compress_pre
+ ldp q0, q1, [sp]
+ ldp q2, q3, [sp, #32]
eor v0.16b, v2.16b, v0.16b
- orr v2.16b, v3.16b, v4.16b
- eor v1.16b, v2.16b, v1.16b
- stp q0, q1, [x0]
+ eor v1.16b, v3.16b, v1.16b
+ ldp x29, x30, [sp, #64]
+ stp q0, q1, [x19]
+ ldr x19, [sp, #80]
+ add sp, sp, #96
+ hint #29
ret
.Lfunc_end0:
.size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2
@@ -504,483 +85,518 @@ zfs_blake3_compress_in_place_sse2:
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI1_0:
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
-.LCPI1_1:
- .xword 0
- .xword -4294967296
-.LCPI1_2:
- .xword -1
- .xword 4294967295
+ .xword -4942790177982912921
+ .xword -6534734903820487822
.text
- .globl zfs_blake3_compress_xof_sse2
.p2align 2
- .type zfs_blake3_compress_xof_sse2,@function
-zfs_blake3_compress_xof_sse2:
+ .type compress_pre,@function
+compress_pre:
.cfi_startproc
- ldp q3, q2, [x0]
- ldp q5, q6, [x1]
- add x10, x1, #32
- lsr x11, x3, #32
- fmov s4, w3
- ld2 { v17.4s, v18.4s }, [x10]
- adrp x10, .LCPI1_2
- and w8, w2, #0xff
- mov v4.s[1], w11
- ldr q1, [x10, :lo12:.LCPI1_2]
- and w9, w4, #0xff
- adrp x12, .LCPI1_0
- mov v4.s[2], w8
- uzp1 v19.4s, v5.4s, v6.4s
- add v3.4s, v2.4s, v3.4s
- ldr q7, [x12, :lo12:.LCPI1_0]
- mov v4.s[3], w9
- add v3.4s, v3.4s, v19.4s
- uzp2 v5.4s, v5.4s, v6.4s
- ext v21.16b, v18.16b, v18.16b, #12
- uzp1 v6.4s, v19.4s, v19.4s
- ext v22.16b, v19.16b, v19.16b, #12
- eor v4.16b, v3.16b, v4.16b
- ext v20.16b, v17.16b, v17.16b, #12
- ext v6.16b, v6.16b, v19.16b, #8
- ext v19.16b, v19.16b, v22.16b, #12
- zip1 v22.2d, v21.2d, v5.2d
- rev32 v24.8h, v4.8h
- mov v4.16b, v1.16b
- zip2 v23.4s, v5.4s, v21.4s
- uzp2 v6.4s, v6.4s, v5.4s
- bsl v4.16b, v22.16b, v20.16b
- add v3.4s, v3.4s, v5.4s
- zip1 v5.4s, v23.4s, v20.4s
- zip1 v22.4s, v20.4s, v23.4s
- add v23.4s, v24.4s, v7.4s
- ext v7.16b, v6.16b, v6.16b, #4
- ext v25.16b, v4.16b, v4.16b, #12
- ext v5.16b, v22.16b, v5.16b, #8
- eor v2.16b, v23.16b, v2.16b
- uzp1 v4.4s, v4.4s, v25.4s
- uzp1 v22.4s, v7.4s, v7.4s
- ext v25.16b, v7.16b, v7.16b, #12
- ext v22.16b, v22.16b, v7.16b, #8
- ext v7.16b, v7.16b, v25.16b, #12
- ushr v25.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- orr v2.16b, v2.16b, v25.16b
- add v3.4s, v3.4s, v2.4s
- eor v24.16b, v3.16b, v24.16b
- add v3.4s, v3.4s, v17.4s
- ushr v17.4s, v24.4s, #8
- shl v18.4s, v24.4s, #24
- orr v17.16b, v18.16b, v17.16b
- add v18.4s, v17.4s, v23.4s
- eor v2.16b, v18.16b, v2.16b
- ushr v23.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- ext v3.16b, v3.16b, v3.16b, #12
- orr v2.16b, v2.16b, v23.16b
- ext v17.16b, v17.16b, v17.16b, #8
- add v3.4s, v2.4s, v3.4s
- adrp x11, .LCPI1_1
- eor v17.16b, v3.16b, v17.16b
- ldr q16, [x11, :lo12:.LCPI1_1]
- ext v18.16b, v18.16b, v18.16b, #4
- rev32 v24.8h, v17.8h
- movi v0.2d, #0xffffffff00000000
- add v23.4s, v3.4s, v21.4s
- mov v21.s[1], v20.s[2]
- add v20.4s, v18.4s, v24.4s
- bit v19.16b, v21.16b, v0.16b
- eor v3.16b, v20.16b, v2.16b
- uzp2 v2.4s, v22.4s, v19.4s
- zip1 v17.2d, v5.2d, v19.2d
- zip2 v18.4s, v19.4s, v5.4s
- ushr v21.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- ext v22.16b, v2.16b, v2.16b, #4
- bsl v16.16b, v4.16b, v17.16b
- zip1 v17.4s, v18.4s, v4.4s
- zip1 v18.4s, v4.4s, v18.4s
- orr v21.16b, v3.16b, v21.16b
- ext v25.16b, v16.16b, v16.16b, #12
- ext v3.16b, v18.16b, v17.16b, #8
- uzp1 v18.4s, v22.4s, v22.4s
- ext v26.16b, v22.16b, v22.16b, #12
- add v23.4s, v23.4s, v21.4s
- uzp1 v17.4s, v16.4s, v25.4s
- ext v16.16b, v18.16b, v22.16b, #8
- ext v18.16b, v22.16b, v26.16b, #12
- eor v22.16b, v23.16b, v24.16b
- add v6.4s, v23.4s, v6.4s
- ushr v23.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v23.16b
- add v20.4s, v22.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v23.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v21.16b, v21.16b, v23.16b
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v21.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v20.16b, v20.16b, v20.16b, #12
- add v6.4s, v6.4s, v19.4s
- rev32 v19.8h, v22.8h
- add v20.4s, v20.4s, v19.4s
- eor v21.16b, v20.16b, v21.16b
- ushr v22.4s, v21.4s, #12
- shl v21.4s, v21.4s, #20
- orr v21.16b, v21.16b, v22.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ushr v22.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v22.16b
- add v20.4s, v19.4s, v20.4s
- eor v21.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #12
- ushr v22.4s, v21.4s, #7
- shl v21.4s, v21.4s, #25
- add v6.4s, v6.4s, v4.4s
- orr v21.16b, v21.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- ext v20.16b, v20.16b, v20.16b, #4
- rev32 v19.8h, v19.8h
- add v20.4s, v20.4s, v19.4s
- add v6.4s, v6.4s, v5.4s
- mov v5.s[1], v4.s[2]
- eor v4.16b, v20.16b, v21.16b
- ushr v21.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- orr v21.16b, v4.16b, v21.16b
- add v6.4s, v6.4s, v21.4s
- eor v19.16b, v6.16b, v19.16b
- add v2.4s, v6.4s, v2.4s
- ushr v6.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v6.16b, v19.16b, v6.16b
- add v19.4s, v6.4s, v20.4s
- eor v20.16b, v19.16b, v21.16b
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v20.4s, v2.4s
- eor v6.16b, v2.16b, v6.16b
- ext v19.16b, v19.16b, v19.16b, #12
- rev32 v6.8h, v6.8h
- add v19.4s, v19.4s, v6.4s
- mov v22.16b, v0.16b
- eor v20.16b, v19.16b, v20.16b
- bsl v22.16b, v5.16b, v7.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- add v2.4s, v2.4s, v22.4s
- orr v20.16b, v20.16b, v21.16b
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- ushr v21.4s, v6.4s, #8
- shl v6.4s, v6.4s, #24
- orr v6.16b, v6.16b, v21.16b
- add v19.4s, v6.4s, v19.4s
- eor v20.16b, v19.16b, v20.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v2.4s, v2.4s, v17.4s
- orr v20.16b, v20.16b, v21.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v2.4s, v2.4s, v20.4s
- eor v6.16b, v2.16b, v6.16b
- uzp2 v5.4s, v16.4s, v22.4s
- zip1 v7.2d, v3.2d, v22.2d
- zip2 v16.4s, v22.4s, v3.4s
- ext v19.16b, v19.16b, v19.16b, #4
- rev32 v22.8h, v6.8h
- ext v23.16b, v5.16b, v5.16b, #4
- bif v7.16b, v17.16b, v1.16b
- zip1 v24.4s, v16.4s, v17.4s
- zip1 v16.4s, v17.4s, v16.4s
- add v21.4s, v2.4s, v3.4s
- mov v3.s[1], v17.s[2]
- add v17.4s, v19.4s, v22.4s
- mov v19.16b, v0.16b
- ext v25.16b, v7.16b, v7.16b, #12
- ext v4.16b, v16.16b, v24.16b, #8
- uzp1 v16.4s, v23.4s, v23.4s
- bsl v19.16b, v3.16b, v18.16b
- eor v2.16b, v17.16b, v20.16b
- uzp1 v7.4s, v7.4s, v25.4s
- ext v25.16b, v16.16b, v23.16b, #8
- zip1 v3.2d, v4.2d, v19.2d
- ushr v20.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- ext v24.16b, v23.16b, v23.16b, #12
- uzp2 v6.4s, v25.4s, v19.4s
- zip2 v18.4s, v19.4s, v4.4s
- bif v3.16b, v7.16b, v1.16b
- orr v20.16b, v2.16b, v20.16b
- ext v16.16b, v23.16b, v24.16b, #12
- ext v23.16b, v6.16b, v6.16b, #4
- zip1 v24.4s, v18.4s, v7.4s
- zip1 v18.4s, v7.4s, v18.4s
- ext v25.16b, v3.16b, v3.16b, #12
- add v21.4s, v21.4s, v20.4s
- ext v2.16b, v18.16b, v24.16b, #8
- uzp1 v18.4s, v23.4s, v23.4s
- ext v24.16b, v23.16b, v23.16b, #12
- uzp1 v3.4s, v3.4s, v25.4s
- eor v22.16b, v21.16b, v22.16b
- ext v25.16b, v18.16b, v23.16b, #8
- dup v18.4s, v2.s[3]
- ext v23.16b, v23.16b, v24.16b, #12
- add v5.4s, v21.4s, v5.4s
- trn1 v21.4s, v3.4s, v3.4s
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- ext v18.16b, v21.16b, v18.16b, #8
- orr v21.16b, v22.16b, v24.16b
- add v17.4s, v21.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v22.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v20.16b, v20.16b, v22.16b
- ext v21.16b, v21.16b, v21.16b, #8
- add v5.4s, v20.4s, v5.4s
- eor v21.16b, v5.16b, v21.16b
- ext v17.16b, v17.16b, v17.16b, #12
- add v5.4s, v5.4s, v19.4s
- rev32 v19.8h, v21.8h
- add v17.4s, v17.4s, v19.4s
- eor v20.16b, v17.16b, v20.16b
- ushr v21.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v21.16b
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ushr v21.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v21.16b
- add v17.4s, v19.4s, v17.4s
- eor v20.16b, v17.16b, v20.16b
- ext v5.16b, v5.16b, v5.16b, #12
- ushr v21.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- add v5.4s, v5.4s, v7.4s
- orr v20.16b, v20.16b, v21.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v5.4s, v5.4s, v20.4s
- eor v19.16b, v5.16b, v19.16b
- ext v17.16b, v17.16b, v17.16b, #4
- rev32 v22.8h, v19.8h
- add v21.4s, v5.4s, v4.4s
- mov v4.s[1], v7.s[2]
- add v19.4s, v17.4s, v22.4s
- bit v16.16b, v4.16b, v0.16b
- eor v5.16b, v19.16b, v20.16b
- uzp2 v4.4s, v25.4s, v16.4s
- zip1 v7.2d, v2.2d, v16.2d
- zip2 v17.4s, v16.4s, v2.4s
- ushr v20.4s, v5.4s, #12
- shl v5.4s, v5.4s, #20
- ext v24.16b, v4.16b, v4.16b, #4
- bif v7.16b, v3.16b, v1.16b
- zip1 v25.4s, v17.4s, v3.4s
- zip1 v17.4s, v3.4s, v17.4s
- orr v20.16b, v5.16b, v20.16b
- ext v26.16b, v7.16b, v7.16b, #12
- ext v5.16b, v17.16b, v25.16b, #8
- uzp1 v17.4s, v24.4s, v24.4s
- ext v25.16b, v24.16b, v24.16b, #12
- bit v23.16b, v18.16b, v0.16b
- add v21.4s, v21.4s, v20.4s
- uzp1 v7.4s, v7.4s, v26.4s
- ext v26.16b, v17.16b, v24.16b, #8
- ext v17.16b, v24.16b, v25.16b, #12
- eor v22.16b, v21.16b, v22.16b
- add v6.4s, v21.4s, v6.4s
- zip1 v21.2d, v5.2d, v23.2d
- zip2 v24.4s, v23.4s, v5.4s
- bif v21.16b, v7.16b, v1.16b
- zip1 v1.4s, v24.4s, v7.4s
- zip1 v24.4s, v7.4s, v24.4s
- ext v1.16b, v24.16b, v1.16b, #8
- ushr v24.4s, v22.4s, #8
- shl v22.4s, v22.4s, #24
- orr v22.16b, v22.16b, v24.16b
- add v19.4s, v22.4s, v19.4s
- ext v24.16b, v21.16b, v21.16b, #12
- eor v20.16b, v19.16b, v20.16b
- uzp1 v21.4s, v21.4s, v24.4s
- ushr v24.4s, v20.4s, #7
- shl v20.4s, v20.4s, #25
- orr v20.16b, v20.16b, v24.16b
- ext v6.16b, v6.16b, v6.16b, #4
- ext v22.16b, v22.16b, v22.16b, #8
- add v6.4s, v20.4s, v6.4s
- eor v22.16b, v6.16b, v22.16b
- ext v19.16b, v19.16b, v19.16b, #12
- add v6.4s, v6.4s, v16.4s
- rev32 v16.8h, v22.8h
- add v19.4s, v19.4s, v16.4s
- eor v20.16b, v19.16b, v20.16b
- ushr v22.4s, v20.4s, #12
- shl v20.4s, v20.4s, #20
- orr v20.16b, v20.16b, v22.16b
- add v6.4s, v6.4s, v20.4s
- eor v16.16b, v6.16b, v16.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v6.4s, v3.4s
- ushr v6.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- orr v6.16b, v16.16b, v6.16b
- add v16.4s, v6.4s, v19.4s
- eor v19.16b, v16.16b, v20.16b
- ushr v20.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v20.16b
- ext v6.16b, v6.16b, v6.16b, #8
- add v3.4s, v3.4s, v19.4s
- eor v6.16b, v3.16b, v6.16b
- ext v16.16b, v16.16b, v16.16b, #4
- add v2.4s, v3.4s, v2.4s
- rev32 v3.8h, v6.8h
- add v6.4s, v16.4s, v3.4s
- eor v16.16b, v6.16b, v19.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- orr v16.16b, v16.16b, v19.16b
- add v2.4s, v2.4s, v16.4s
- eor v3.16b, v2.16b, v3.16b
- add v2.4s, v2.4s, v4.4s
- ushr v4.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v4.16b
- add v4.4s, v3.4s, v6.4s
- eor v6.16b, v4.16b, v16.16b
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- ext v2.16b, v2.16b, v2.16b, #4
- orr v6.16b, v6.16b, v16.16b
- ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v6.4s, v2.4s
+ hint #34
+ fmov s1, w3
+ movi d0, #0x0000ff000000ff
+ ldr q2, [x1]
+ fmov d3, x4
+ adrp x8, .LCPI1_0
+ mov v1.s[1], w5
+ str q2, [x0]
+ ldr q4, [x8, :lo12:.LCPI1_0]
+ add x8, x2, #32
+ ldr q5, [x1, #16]
+ and v0.8b, v1.8b, v0.8b
+ stp q5, q4, [x0, #16]
+ mov v3.d[1], v0.d[0]
+ str q3, [x0, #48]
+ ldp q0, q6, [x2]
+ uzp1 v1.4s, v0.4s, v6.4s
+ uzp2 v0.4s, v0.4s, v6.4s
+ add v2.4s, v2.4s, v1.4s
+ uzp1 v18.4s, v1.4s, v1.4s
+ add v2.4s, v2.4s, v5.4s
eor v3.16b, v2.16b, v3.16b
- ext v4.16b, v4.16b, v4.16b, #12
+ add v2.4s, v2.4s, v0.4s
rev32 v3.8h, v3.8h
- add v4.4s, v4.4s, v3.4s
- eor v6.16b, v4.16b, v6.16b
- ushr v16.4s, v6.4s, #12
- shl v6.4s, v6.4s, #20
- add v2.4s, v2.4s, v23.4s
- orr v6.16b, v6.16b, v16.16b
- add v2.4s, v2.4s, v6.4s
+ add v4.4s, v3.4s, v4.4s
+ eor v5.16b, v4.16b, v5.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v5.4s
eor v3.16b, v2.16b, v3.16b
- ushr v16.4s, v3.4s, #8
+ ushr v6.4s, v3.4s, #8
shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v16.16b
+ orr v3.16b, v3.16b, v6.16b
+ ld2 { v6.4s, v7.4s }, [x8]
add v4.4s, v3.4s, v4.4s
- eor v6.16b, v4.16b, v6.16b
- ext v2.16b, v2.16b, v2.16b, #12
- ushr v16.4s, v6.4s, #7
- shl v6.4s, v6.4s, #25
- add v2.4s, v2.4s, v7.4s
- orr v6.16b, v6.16b, v16.16b
ext v3.16b, v3.16b, v3.16b, #8
add v2.4s, v2.4s, v6.4s
- eor v3.16b, v2.16b, v3.16b
+ eor v5.16b, v4.16b, v5.16b
ext v4.16b, v4.16b, v4.16b, #4
- rev32 v3.8h, v3.8h
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v16.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v7.16b, v7.16b, #12
add v2.4s, v2.4s, v5.4s
- mov v5.s[1], v7.s[2]
+ mov v7.16b, v16.16b
+ eor v3.16b, v3.16b, v2.16b
+ add v2.4s, v2.4s, v16.4s
+ mov v7.s[1], v6.s[2]
+ rev32 v3.8h, v3.8h
add v4.4s, v4.4s, v3.4s
- bsl v0.16b, v5.16b, v17.16b
- eor v5.16b, v4.16b, v6.16b
- ushr v6.4s, v5.4s, #12
+ eor v5.16b, v4.16b, v5.16b
+ ushr v17.4s, v5.4s, #12
shl v5.4s, v5.4s, #20
- orr v5.16b, v5.16b, v6.16b
+ orr v5.16b, v5.16b, v17.16b
add v2.4s, v2.4s, v5.4s
eor v3.16b, v2.16b, v3.16b
- ushr v6.4s, v3.4s, #8
+ ushr v17.4s, v3.4s, #8
shl v3.4s, v3.4s, #24
- orr v3.16b, v3.16b, v6.16b
+ orr v3.16b, v3.16b, v17.16b
+ ext v17.16b, v18.16b, v1.16b, #8
add v4.4s, v3.4s, v4.4s
- uzp2 v18.4s, v26.4s, v18.4s
+ uzp2 v17.4s, v17.4s, v0.4s
+ ext v3.16b, v3.16b, v3.16b, #8
eor v5.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v18.4s
- ushr v6.4s, v5.4s, #7
+ add v2.4s, v2.4s, v17.4s
+ ext v4.16b, v4.16b, v4.16b, #12
+ ushr v18.4s, v5.4s, #7
shl v5.4s, v5.4s, #25
ext v2.16b, v2.16b, v2.16b, #4
- orr v5.16b, v5.16b, v6.16b
+ orr v5.16b, v5.16b, v18.16b
+ ext v18.16b, v1.16b, v1.16b, #12
+ add v2.4s, v2.4s, v5.4s
+ ext v1.16b, v1.16b, v18.16b, #12
+ zip1 v18.2d, v16.2d, v0.2d
+ zip2 v0.4s, v0.4s, v16.4s
+ eor v3.16b, v3.16b, v2.16b
+ rev64 v1.4s, v1.4s
+ mov v18.s[3], v6.s[3]
+ zip1 v16.4s, v0.4s, v6.4s
+ rev32 v3.8h, v3.8h
+ trn2 v1.4s, v1.4s, v7.4s
+ zip1 v0.4s, v6.4s, v0.4s
+ add v4.4s, v4.4s, v3.4s
+ add v2.4s, v2.4s, v1.4s
+ ext v6.16b, v0.16b, v16.16b, #8
+ eor v5.16b, v4.16b, v5.16b
+ ushr v7.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v7.16b
+ add v7.4s, v2.4s, v5.4s
+ eor v2.16b, v7.16b, v3.16b
+ ext v7.16b, v7.16b, v7.16b, #12
+ ushr v3.4s, v2.4s, #8
+ shl v2.4s, v2.4s, #24
+ orr v3.16b, v2.16b, v3.16b
+ ext v2.16b, v18.16b, v18.16b, #12
+ add v4.4s, v3.4s, v4.4s
+ uzp1 v2.4s, v18.4s, v2.4s
ext v3.16b, v3.16b, v3.16b, #8
- add v2.4s, v5.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
+ eor v5.16b, v4.16b, v5.16b
+ add v7.4s, v7.4s, v2.4s
+ ext v4.16b, v4.16b, v4.16b, #4
+ ushr v18.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v18.16b
+ add v7.4s, v7.4s, v5.4s
+ eor v3.16b, v3.16b, v7.16b
+ add v7.4s, v7.4s, v6.4s
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v5.16b, v4.16b, v5.16b
+ ushr v0.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v0.16b, v5.16b, v0.16b
+ add v5.4s, v7.4s, v0.4s
+ ext v7.16b, v17.16b, v17.16b, #4
+ eor v3.16b, v5.16b, v3.16b
+ uzp1 v17.4s, v7.4s, v7.4s
+ ushr v16.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v3.16b, v3.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v4.4s, v3.4s, v4.4s
+ uzp2 v16.4s, v16.4s, v1.4s
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v0.16b, v4.16b, v0.16b
+ add v5.4s, v5.4s, v16.4s
+ ext v4.16b, v4.16b, v4.16b, #12
+ ushr v17.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v0.16b, v0.16b, v17.16b
+ ext v17.16b, v7.16b, v7.16b, #12
+ add v5.4s, v5.4s, v0.4s
+ ext v7.16b, v7.16b, v17.16b, #12
+ mov v17.16b, v6.16b
+ eor v3.16b, v3.16b, v5.16b
+ rev64 v7.4s, v7.4s
+ mov v17.s[1], v2.s[2]
+ rev32 v3.8h, v3.8h
+ add v4.4s, v4.4s, v3.4s
+ eor v18.16b, v4.16b, v0.16b
+ trn2 v0.4s, v7.4s, v17.4s
+ ushr v7.4s, v18.4s, #12
+ shl v17.4s, v18.4s, #20
+ add v5.4s, v5.4s, v0.4s
+ zip1 v18.2d, v6.2d, v1.2d
+ zip2 v1.4s, v1.4s, v6.4s
+ orr v7.16b, v17.16b, v7.16b
+ mov v18.s[3], v2.s[3]
+ zip1 v6.4s, v1.4s, v2.4s
+ add v5.4s, v5.4s, v7.4s
+ zip1 v1.4s, v2.4s, v1.4s
+ eor v3.16b, v5.16b, v3.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ext v6.16b, v1.16b, v6.16b, #8
+ ushr v17.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
+ orr v17.16b, v3.16b, v17.16b
+ ext v3.16b, v18.16b, v18.16b, #12
+ add v4.4s, v17.4s, v4.4s
+ uzp1 v3.4s, v18.4s, v3.4s
+ ext v17.16b, v17.16b, v17.16b, #8
+ eor v7.16b, v4.16b, v7.16b
+ add v5.4s, v5.4s, v3.4s
+ ext v4.16b, v4.16b, v4.16b, #4
+ ushr v18.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ orr v7.16b, v7.16b, v18.16b
+ add v5.4s, v5.4s, v7.4s
+ eor v17.16b, v17.16b, v5.16b
+ add v5.4s, v5.4s, v6.4s
+ rev32 v17.8h, v17.8h
+ add v4.4s, v4.4s, v17.4s
+ eor v2.16b, v4.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #4
+ ushr v1.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v2.16b, v1.16b
+ add v2.4s, v5.4s, v1.4s
+ eor v5.16b, v2.16b, v17.16b
+ uzp1 v17.4s, v7.4s, v7.4s
+ ushr v16.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v4.4s, v5.4s, v4.4s
+ uzp2 v16.4s, v16.4s, v0.4s
+ ext v5.16b, v5.16b, v5.16b, #8
+ eor v1.16b, v4.16b, v1.16b
+ add v2.4s, v2.4s, v16.4s
ext v4.16b, v4.16b, v4.16b, #12
- add v0.4s, v2.4s, v0.4s
- rev32 v2.8h, v3.8h
- add v3.4s, v4.4s, v2.4s
- eor v4.16b, v3.16b, v5.16b
- ushr v5.4s, v4.4s, #12
+ ushr v17.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v1.16b, v1.16b, v17.16b
+ ext v17.16b, v7.16b, v7.16b, #12
+ add v2.4s, v2.4s, v1.4s
+ ext v7.16b, v7.16b, v17.16b, #12
+ mov v17.16b, v6.16b
+ eor v5.16b, v5.16b, v2.16b
+ rev64 v7.4s, v7.4s
+ mov v17.s[1], v3.s[2]
+ rev32 v5.8h, v5.8h
+ add v4.4s, v4.4s, v5.4s
+ eor v18.16b, v4.16b, v1.16b
+ trn2 v1.4s, v7.4s, v17.4s
+ ushr v7.4s, v18.4s, #12
+ shl v17.4s, v18.4s, #20
+ add v2.4s, v2.4s, v1.4s
+ zip1 v18.2d, v6.2d, v0.2d
+ zip2 v0.4s, v0.4s, v6.4s
+ orr v7.16b, v17.16b, v7.16b
+ mov v18.s[3], v3.s[3]
+ add v2.4s, v2.4s, v7.4s
+ eor v5.16b, v2.16b, v5.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v17.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ orr v5.16b, v5.16b, v17.16b
+ add v17.4s, v5.4s, v4.4s
+ ext v4.16b, v18.16b, v18.16b, #12
+ ext v5.16b, v5.16b, v5.16b, #8
+ eor v7.16b, v17.16b, v7.16b
+ uzp1 v4.4s, v18.4s, v4.4s
+ ext v17.16b, v17.16b, v17.16b, #4
+ ushr v18.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ add v2.4s, v2.4s, v4.4s
+ orr v7.16b, v7.16b, v18.16b
+ add v2.4s, v2.4s, v7.4s
+ eor v5.16b, v5.16b, v2.16b
+ rev32 v5.8h, v5.8h
+ add v6.4s, v17.4s, v5.4s
+ zip1 v17.4s, v0.4s, v3.4s
+ zip1 v0.4s, v3.4s, v0.4s
+ eor v3.16b, v6.16b, v7.16b
+ ext v0.16b, v0.16b, v17.16b, #8
+ ushr v7.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v2.4s, v2.4s, v0.4s
+ orr v3.16b, v3.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #4
+ add v2.4s, v2.4s, v3.4s
+ uzp1 v17.4s, v7.4s, v7.4s
+ eor v5.16b, v2.16b, v5.16b
+ ushr v16.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v6.4s, v5.4s, v6.4s
+ uzp2 v16.4s, v16.4s, v1.4s
+ ext v5.16b, v5.16b, v5.16b, #8
+ eor v3.16b, v6.16b, v3.16b
+ add v2.4s, v2.4s, v16.4s
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v17.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ ext v2.16b, v2.16b, v2.16b, #4
+ orr v3.16b, v3.16b, v17.16b
+ add v17.4s, v2.4s, v3.4s
+ eor v2.16b, v5.16b, v17.16b
+ ext v5.16b, v7.16b, v7.16b, #12
+ rev32 v18.8h, v2.8h
+ ext v2.16b, v7.16b, v5.16b, #12
+ mov v5.16b, v0.16b
+ add v6.4s, v6.4s, v18.4s
+ rev64 v2.4s, v2.4s
+ mov v5.s[1], v4.s[2]
+ eor v3.16b, v6.16b, v3.16b
+ trn2 v2.4s, v2.4s, v5.4s
+ ushr v5.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v7.4s, v17.4s, v2.4s
+ orr v3.16b, v3.16b, v5.16b
+ add v5.4s, v7.4s, v3.4s
+ eor v7.16b, v5.16b, v18.16b
+ zip1 v18.2d, v0.2d, v1.2d
+ ext v5.16b, v5.16b, v5.16b, #12
+ zip2 v0.4s, v1.4s, v0.4s
+ ushr v17.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ mov v18.s[3], v4.s[3]
+ orr v7.16b, v7.16b, v17.16b
+ ext v17.16b, v18.16b, v18.16b, #12
+ add v6.4s, v7.4s, v6.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v19.16b, v6.16b, v3.16b
+ uzp1 v3.4s, v18.4s, v17.4s
+ ext v6.16b, v6.16b, v6.16b, #4
+ ushr v17.4s, v19.4s, #7
+ shl v18.4s, v19.4s, #25
+ add v5.4s, v5.4s, v3.4s
+ orr v17.16b, v18.16b, v17.16b
+ add v5.4s, v5.4s, v17.4s
+ eor v7.16b, v7.16b, v5.16b
+ rev32 v7.8h, v7.8h
+ add v1.4s, v6.4s, v7.4s
+ zip1 v6.4s, v0.4s, v4.4s
+ zip1 v0.4s, v4.4s, v0.4s
+ eor v4.16b, v1.16b, v17.16b
+ ext v6.16b, v0.16b, v6.16b, #8
+ ushr v0.4s, v4.4s, #12
shl v4.4s, v4.4s, #20
- orr v4.16b, v4.16b, v5.16b
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ushr v5.4s, v2.4s, #8
- shl v2.4s, v2.4s, #24
- orr v2.16b, v2.16b, v5.16b
- add v3.4s, v2.4s, v3.4s
- eor v4.16b, v3.16b, v4.16b
- ext v0.16b, v0.16b, v0.16b, #12
- ushr v5.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- add v0.4s, v0.4s, v21.4s
- orr v4.16b, v4.16b, v5.16b
- ext v2.16b, v2.16b, v2.16b, #8
- add v0.4s, v0.4s, v4.4s
- eor v2.16b, v0.16b, v2.16b
- ext v3.16b, v3.16b, v3.16b, #4
- add v0.4s, v0.4s, v1.4s
- rev32 v1.8h, v2.8h
- add v2.4s, v3.4s, v1.4s
- eor v3.16b, v2.16b, v4.16b
- ushr v4.4s, v3.4s, #12
+ add v5.4s, v5.4s, v6.4s
+ zip1 v20.2d, v6.2d, v2.2d
+ orr v0.16b, v4.16b, v0.16b
+ mov v20.s[3], v3.s[3]
+ add v4.4s, v5.4s, v0.4s
+ eor v5.16b, v4.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #4
+ ushr v16.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ uzp1 v17.4s, v7.4s, v7.4s
+ orr v5.16b, v5.16b, v16.16b
+ ext v16.16b, v17.16b, v7.16b, #8
+ add v1.4s, v5.4s, v1.4s
+ uzp2 v16.4s, v16.4s, v2.4s
+ zip2 v2.4s, v2.4s, v6.4s
+ eor v0.16b, v1.16b, v0.16b
+ add v4.4s, v4.4s, v16.4s
+ ext v1.16b, v1.16b, v1.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v17.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ext v4.16b, v4.16b, v4.16b, #4
+ orr v17.16b, v0.16b, v17.16b
+ ext v0.16b, v5.16b, v5.16b, #8
+ ext v5.16b, v7.16b, v7.16b, #12
+ add v4.4s, v4.4s, v17.4s
+ eor v0.16b, v0.16b, v4.16b
+ rev32 v18.8h, v0.8h
+ ext v0.16b, v7.16b, v5.16b, #12
+ mov v5.16b, v6.16b
+ add v7.4s, v1.4s, v18.4s
+ rev64 v1.4s, v0.4s
+ mov v5.s[1], v3.s[2]
+ eor v17.16b, v7.16b, v17.16b
+ trn2 v1.4s, v1.4s, v5.4s
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v4.4s, v4.4s, v1.4s
+ orr v17.16b, v17.16b, v19.16b
+ add v19.4s, v4.4s, v17.4s
+ eor v4.16b, v19.16b, v18.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ ushr v18.4s, v4.4s, #8
+ shl v4.4s, v4.4s, #24
+ orr v18.16b, v4.16b, v18.16b
+ ext v4.16b, v20.16b, v20.16b, #12
+ add v7.4s, v18.4s, v7.4s
+ uzp1 v4.4s, v20.4s, v4.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v17.16b, v7.16b, v17.16b
+ add v19.4s, v19.4s, v4.4s
+ ext v7.16b, v7.16b, v7.16b, #4
+ ushr v20.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ orr v17.16b, v17.16b, v20.16b
+ add v19.4s, v19.4s, v17.4s
+ eor v18.16b, v18.16b, v19.16b
+ rev32 v18.8h, v18.8h
+ add v6.4s, v7.4s, v18.4s
+ zip1 v7.4s, v2.4s, v3.4s
+ zip1 v2.4s, v3.4s, v2.4s
+ eor v3.16b, v6.16b, v17.16b
+ ext v2.16b, v2.16b, v7.16b, #8
+ ushr v7.4s, v3.4s, #12
shl v3.4s, v3.4s, #20
- orr v3.16b, v3.16b, v4.16b
- add v0.4s, v0.4s, v3.4s
- eor v1.16b, v0.16b, v1.16b
- ushr v4.4s, v1.4s, #8
- shl v1.4s, v1.4s, #24
- orr v1.16b, v1.16b, v4.16b
- add v2.4s, v1.4s, v2.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v4.4s, v3.4s, #7
+ add v17.4s, v19.4s, v2.4s
+ zip1 v1.2d, v2.2d, v1.2d
+ zip2 v0.4s, v0.4s, v2.4s
+ orr v3.16b, v3.16b, v7.16b
+ mov v1.s[3], v4.s[3]
+ add v7.4s, v17.4s, v3.4s
+ eor v17.16b, v7.16b, v18.16b
+ ext v7.16b, v7.16b, v7.16b, #4
+ ushr v18.4s, v17.4s, #8
+ shl v17.4s, v17.4s, #24
+ orr v17.16b, v17.16b, v18.16b
+ ext v18.16b, v16.16b, v16.16b, #8
+ add v6.4s, v17.4s, v6.4s
+ uzp2 v5.4s, v18.4s, v5.4s
+ eor v3.16b, v6.16b, v3.16b
+ ext v5.16b, v5.16b, v18.16b, #4
+ ext v6.16b, v6.16b, v6.16b, #12
+ ushr v18.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ add v5.4s, v7.4s, v5.4s
+ ext v7.16b, v17.16b, v17.16b, #8
+ ext v17.16b, v16.16b, v16.16b, #12
+ orr v3.16b, v3.16b, v18.16b
+ ext v16.16b, v16.16b, v17.16b, #12
+ add v5.4s, v3.4s, v5.4s
+ mov v17.16b, v2.16b
+ rev64 v16.4s, v16.4s
+ eor v7.16b, v7.16b, v5.16b
+ mov v17.s[1], v4.s[2]
+ rev32 v7.8h, v7.8h
+ trn2 v16.4s, v16.4s, v17.4s
+ add v6.4s, v6.4s, v7.4s
+ add v5.4s, v5.4s, v16.4s
+ eor v3.16b, v6.16b, v3.16b
+ ushr v17.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ orr v3.16b, v3.16b, v17.16b
+ add v5.4s, v5.4s, v3.4s
+ eor v7.16b, v5.16b, v7.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ ushr v16.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ orr v7.16b, v7.16b, v16.16b
+ ext v16.16b, v1.16b, v1.16b, #12
+ add v6.4s, v7.4s, v6.4s
+ uzp1 v1.4s, v1.4s, v16.4s
+ eor v3.16b, v6.16b, v3.16b
+ add v1.4s, v5.4s, v1.4s
+ ext v5.16b, v7.16b, v7.16b, #8
+ ext v6.16b, v6.16b, v6.16b, #4
+ ushr v16.4s, v3.4s, #7
shl v3.4s, v3.4s, #25
+ orr v3.16b, v3.16b, v16.16b
+ add v1.4s, v1.4s, v3.4s
+ eor v5.16b, v5.16b, v1.16b
+ rev32 v5.8h, v5.8h
+ add v2.4s, v6.4s, v5.4s
+ zip1 v6.4s, v0.4s, v4.4s
+ zip1 v0.4s, v4.4s, v0.4s
+ eor v3.16b, v2.16b, v3.16b
+ ext v0.16b, v0.16b, v6.16b, #8
+ ushr v4.4s, v3.4s, #12
+ shl v3.4s, v3.4s, #20
+ add v0.4s, v1.4s, v0.4s
+ orr v1.16b, v3.16b, v4.16b
+ add v0.4s, v0.4s, v1.4s
+ eor v3.16b, v0.16b, v5.16b
ext v0.16b, v0.16b, v0.16b, #4
- ext v1.16b, v1.16b, v1.16b, #8
- ext v2.16b, v2.16b, v2.16b, #12
+ ushr v4.4s, v3.4s, #8
+ shl v3.4s, v3.4s, #24
orr v3.16b, v3.16b, v4.16b
+ add v2.4s, v3.4s, v2.4s
+ ext v3.16b, v3.16b, v3.16b, #8
+ eor v1.16b, v2.16b, v1.16b
+ ext v2.16b, v2.16b, v2.16b, #12
+ ushr v4.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ stp q2, q3, [x0, #32]
+ orr v1.16b, v1.16b, v4.16b
+ stp q0, q1, [x0]
+ ret
+.Lfunc_end1:
+ .size compress_pre, .Lfunc_end1-compress_pre
+ .cfi_endproc
+
+ .globl zfs_blake3_compress_xof_sse2 /* exported wrapper: runs compress_pre into a 64-byte stack buffer, then XOR-folds the result into the buffer passed in x5 */
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse2,@function
+zfs_blake3_compress_xof_sse2: /* in: x0, x1, w2, x3, w4 forwarded to compress_pre; x5 = 64-byte output (presumably cv, block, block_len, counter, flags, out as in BLAKE3 compress_xof -- confirm against the C prototype) */
+ .cfi_startproc
+ hint #25 /* HINT #25 is the PACIASP encoding: sign the return address (executes as NOP without pointer authentication) */
+ .cfi_negate_ra_state
+ sub sp, sp, #96 /* frame: [sp, #0..63] scratch for compress_pre, [sp, #64] x29/x30, [sp, #80] x20/x19 */
+ stp x29, x30, [sp, #64]
+ add x29, sp, #64 /* frame pointer addresses the saved x29/x30 pair */
+ stp x20, x19, [sp, #80] /* callee-saved; used below to keep x0 and x5 alive across the call */
+ .cfi_def_cfa w29, 32 /* CFA = x29 + 32 = sp + 96, i.e. sp at function entry */
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w30, -24
+ .cfi_offset w29, -32
+ mov x20, x0 /* keep original x0; it is re-read after the call */
+ mov x19, x5 /* keep output pointer across the call */
+ mov w5, w4 /* shift incoming arguments up by one slot to make room for the scratch pointer */
+ mov x4, x3
+ mov w3, w2
+ mov x2, x1
+ mov x0, sp /* compress_pre arg 0: 64-byte scratch buffer on the stack */
+ mov x1, x20 /* compress_pre arg 1: the original x0 */
+ bl compress_pre /* writes four 16-byte vectors to [x0] .. [x0, #48] */
+ ldp q0, q1, [sp] /* q0, q1 = scratch vectors 0 and 1 */
+ ldp q2, q3, [sp, #32] /* q2, q3 = scratch vectors 2 and 3 */
eor v0.16b, v2.16b, v0.16b
- eor v3.16b, v3.16b, v1.16b
- stp q0, q3, [x5]
- ldr q0, [x0]
+ eor v1.16b, v3.16b, v1.16b /* v1 = vector 3 ^ vector 1; q3 stays intact for the last output word group */
+ ldp x29, x30, [sp, #64] /* restore frame pointer and return address (scheduled early) */
+ stp q0, q1, [x19] /* out[0..31] = (vec0 ^ vec2, vec1 ^ vec3) */
+ ldr q0, [x20] /* first 16 bytes of the buffer originally passed in x0 */
eor v0.16b, v0.16b, v2.16b
- str q0, [x5, #32]
- ldr q0, [x0, #16]
- eor v0.16b, v0.16b, v1.16b
- str q0, [x5, #48]
+ str q0, [x19, #32] /* out[32..47] = x0-buffer[0..15] ^ vec2 */
+ ldr q0, [x20, #16] /* next 16 bytes of the buffer originally passed in x0 */
+ eor v0.16b, v0.16b, v3.16b
+ str q0, [x19, #48] /* out[48..63] = x0-buffer[16..31] ^ vec3 */
+ ldp x20, x19, [sp, #80] /* restore callee-saved registers */
+ add sp, sp, #96 /* release the frame */
+ hint #29 /* HINT #29 is the AUTIASP encoding: authenticate the return address signed at entry */
ret
-.Lfunc_end1:
- .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2
+.Lfunc_end2:
+ .size zfs_blake3_compress_xof_sse2, .Lfunc_end2-zfs_blake3_compress_xof_sse2
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
-.LCPI2_0:
+.LCPI3_0:
.word 0
.word 1
.word 2
@@ -991,19 +607,21 @@ zfs_blake3_compress_xof_sse2:
.type zfs_blake3_hash_many_sse2,@function
zfs_blake3_hash_many_sse2:
.cfi_startproc
+ hint #25
+ .cfi_negate_ra_state
stp d15, d14, [sp, #-160]!
stp d13, d12, [sp, #16]
stp d11, d10, [sp, #32]
stp d9, d8, [sp, #48]
stp x29, x30, [sp, #64]
+ add x29, sp, #64
stp x28, x27, [sp, #80]
stp x26, x25, [sp, #96]
stp x24, x23, [sp, #112]
stp x22, x21, [sp, #128]
stp x20, x19, [sp, #144]
- mov x29, sp
- sub sp, sp, #384
- .cfi_def_cfa w29, 160
+ sub sp, sp, #464
+ .cfi_def_cfa w29, 96
.cfi_offset w19, -8
.cfi_offset w20, -16
.cfi_offset w21, -24
@@ -1024,1414 +642,1406 @@ zfs_blake3_hash_many_sse2:
.cfi_offset b13, -144
.cfi_offset b14, -152
.cfi_offset b15, -160
- ldr x26, [x29, #168]
- ldrb w27, [x29, #160]
mov w19, w6
mov x20, x4
- mov x22, x2
- mov x28, x1
+ mov x24, x1
+ ldr x26, [x29, #104]
+ ldrb w27, [x29, #96]
cmp x1, #4
- mov x24, x0
str x3, [sp, #40]
- b.lo .LBB2_8
- adrp x9, .LCPI2_0
- ldr q0, [x9, :lo12:.LCPI2_0]
- sbfx w11, w5, #0, #1
- dup v1.4s, w11
- mov w9, #58983
+ b.lo .LBB3_6
+ adrp x8, .LCPI3_0
+ sbfx w9, w5, #0, #1
mov w10, #44677
- and v0.16b, v1.16b, v0.16b
mov w11, #62322
- mov w12, #62778
- orr w8, w7, w19
- movk w9, #27145, lsl #16
movk w10, #47975, lsl #16
movk w11, #15470, lsl #16
+ ldr q0, [x8, :lo12:.LCPI3_0]
+ dup v1.4s, w9
+ mov w9, #58983
+ orr w8, w7, w19
+ movk w9, #27145, lsl #16
+ and v0.16b, v1.16b, v0.16b
+ dup v1.4s, w11
+ movi v24.4s, #64
+ dup v2.4s, w9
+ mov w9, #62778
+ movk w9, #42319, lsl #16
str q0, [sp, #16]
orr v0.4s, #128, lsl #24
- movk w12, #42319, lsl #16
+ stp q2, q1, [sp, #48]
str q0, [sp]
-.LBB2_2:
- ldr x0, [sp, #40]
- mov x13, x0
- ld1r { v20.4s }, [x13], #4
- add x14, x0, #8
- add x15, x0, #12
- add x16, x0, #16
- add x17, x0, #20
- add x18, x0, #24
- add x0, x0, #28
- ld1r { v17.4s }, [x14]
- ld1r { v6.4s }, [x15]
- ld1r { v8.4s }, [x16]
- ld1r { v9.4s }, [x17]
- ld1r { v31.4s }, [x18]
- ld1r { v26.4s }, [x13]
- ld1r { v15.4s }, [x0]
- cbz x22, .LBB2_7
+ dup v0.4s, w10
+ str q0, [sp, #80]
+ b .LBB3_3
+.LBB3_2:
+ zip1 v0.4s, v12.4s, v31.4s
+ add x10, x20, #4
+ zip1 v1.4s, v29.4s, v30.4s
+ tst w5, #0x1
+ zip1 v2.4s, v28.4s, v23.4s
+ csel x20, x10, x20, ne
+ zip1 v3.4s, v13.4s, v25.4s
+ add x0, x0, #32
+ zip2 v6.4s, v12.4s, v31.4s
+ sub x24, x24, #4
+ zip1 v4.2d, v0.2d, v1.2d
+ cmp x24, #3
+ zip2 v7.4s, v29.4s, v30.4s
+ zip1 v5.2d, v2.2d, v3.2d
+ zip2 v0.2d, v0.2d, v1.2d
+ zip2 v1.2d, v2.2d, v3.2d
+ zip2 v2.4s, v28.4s, v23.4s
+ zip2 v3.4s, v13.4s, v25.4s
+ stp q4, q5, [x26]
+ zip2 v4.2d, v6.2d, v7.2d
+ stp q0, q1, [x26, #32]
+ zip1 v0.2d, v6.2d, v7.2d
+ zip1 v1.2d, v2.2d, v3.2d
+ zip2 v2.2d, v2.2d, v3.2d
+ stp q0, q1, [x26, #64]
+ stp q4, q2, [x26, #96]
+ add x26, x26, #128
+ b.ls .LBB3_6
+.LBB3_3:
+ ldr x14, [sp, #40]
+ mov x10, x14
+ add x11, x14, #8
+ add x12, x14, #12
+ add x13, x14, #16
+ ld1r { v12.4s }, [x10], #4
+ ld1r { v29.4s }, [x11]
+ add x11, x14, #20
+ ld1r { v30.4s }, [x12]
+ add x12, x14, #24
+ ld1r { v28.4s }, [x13]
+ ld1r { v23.4s }, [x11]
+ add x11, x14, #28
+ ld1r { v13.4s }, [x12]
+ ld1r { v31.4s }, [x10]
+ ld1r { v25.4s }, [x11]
+ cbz x2, .LBB3_2
ldr q1, [sp, #16]
dup v0.4s, w20
- ldp x13, x14, [x24]
- ldp x15, x16, [x24, #16]
+ lsr x12, x20, #32
+ mov x10, xzr
+ ldp x13, x14, [x0, #16]
add v1.4s, v0.4s, v1.4s
+ mov x15, x2
movi v0.4s, #128, lsl #24
- str q1, [sp, #64]
+ mov w4, w8
+ str q1, [sp, #112]
eor v0.16b, v1.16b, v0.16b
ldr q1, [sp]
- lsr x18, x20, #32
- mov x17, xzr
cmgt v0.4s, v1.4s, v0.4s
- dup v1.4s, w18
+ dup v1.4s, w12
+ ldp x11, x12, [x0]
sub v0.4s, v1.4s, v0.4s
- mov w18, w8
- str q0, [sp, #48]
-.LBB2_4:
- mov w2, #16
- bfi x2, x17, #6, #58
- ldr q1, [x13, x2]
- ldr q3, [x14, x2]
- ldr q2, [x15, x2]
- ldr q4, [x16, x2]
- mov w2, #32
- bfi x2, x17, #6, #58
- ldr q5, [x13, x2]
- ldr q18, [x14, x2]
- ldr q19, [x15, x2]
- ldr q23, [x16, x2]
- mov w2, #48
- lsl x3, x17, #6
- bfi x2, x17, #6, #58
- add x17, x17, #1
- ldr q0, [x13, x3]
- ldr q21, [x14, x3]
- ldr q7, [x15, x3]
- ldr q16, [x16, x3]
- cmp x17, x22
- ldr q13, [x13, x2]
- ldr q14, [x14, x2]
- ldr q29, [x15, x2]
- ldr q10, [x16, x2]
- csel w2, w27, wzr, eq
- orr w18, w2, w18
- mov x0, xzr
- and w18, w18, #0xff
- add x3, x3, #256
-.LBB2_5:
- ldr x2, [x24, x0]
- add x0, x0, #8
- cmp x0, #32
- add x2, x2, x3
- prfm pldl1keep, [x2]
- b.ne .LBB2_5
- dup v22.4s, w18
- str q22, [sp, #192]
- zip1 v27.4s, v0.4s, v21.4s
- zip2 v21.4s, v0.4s, v21.4s
- zip1 v0.4s, v7.4s, v16.4s
- zip2 v22.4s, v7.4s, v16.4s
- zip1 v7.4s, v1.4s, v3.4s
- zip1 v25.4s, v2.4s, v4.4s
- zip2 v16.4s, v2.4s, v4.4s
- zip1 v11.4s, v19.4s, v23.4s
- zip2 v12.4s, v19.4s, v23.4s
- zip1 v19.4s, v13.4s, v14.4s
- zip2 v23.4s, v13.4s, v14.4s
- zip1 v13.4s, v29.4s, v10.4s
- zip2 v14.4s, v29.4s, v10.4s
- add v10.4s, v20.4s, v8.4s
- add v2.4s, v26.4s, v9.4s
- ext v20.16b, v22.16b, v21.16b, #8
- ext v26.16b, v25.16b, v7.16b, #8
- zip2 v24.4s, v1.4s, v3.4s
- add v1.4s, v6.4s, v15.4s
- ext v6.16b, v0.16b, v27.16b, #8
- ext v20.16b, v21.16b, v20.16b, #8
- mov v21.d[1], v22.d[0]
- ext v22.16b, v7.16b, v26.16b, #8
- mov v7.d[1], v25.d[0]
- add v3.4s, v17.4s, v31.4s
- str q1, [sp, #144]
- ext v1.16b, v27.16b, v6.16b, #8
- mov v6.16b, v7.16b
- zip1 v28.4s, v5.4s, v18.4s
- stur q1, [x29, #-80]
- mov v1.16b, v27.16b
- mov v27.16b, v24.16b
- add v3.4s, v3.4s, v6.4s
- ldr q6, [sp, #64]
- ext v29.16b, v16.16b, v24.16b, #8
- mov v1.d[1], v0.d[0]
- ext v0.16b, v11.16b, v28.16b, #8
- mov v27.d[1], v16.d[0]
- ext v16.16b, v14.16b, v23.16b, #8
- stur q7, [x29, #-144]
- ext v7.16b, v24.16b, v29.16b, #8
- ext v29.16b, v28.16b, v0.16b, #8
- ext v0.16b, v23.16b, v16.16b, #8
- mov v23.d[1], v14.d[0]
- stp q0, q23, [sp, #80]
- add v0.4s, v10.4s, v1.4s
- eor v16.16b, v0.16b, v6.16b
- ldr q6, [sp, #48]
- add v2.4s, v2.4s, v21.4s
- mov v28.d[1], v11.d[0]
- zip2 v18.4s, v5.4s, v18.4s
- eor v10.16b, v2.16b, v6.16b
- movi v6.4s, #64
- eor v11.16b, v3.16b, v6.16b
- ldr q6, [sp, #144]
- dup v17.4s, w9
- ext v30.16b, v12.16b, v18.16b, #8
- rev32 v16.8h, v16.8h
- dup v5.4s, w10
- ext v25.16b, v18.16b, v30.16b, #8
- mov v30.16b, v23.16b
- mov v23.16b, v1.16b
- str q1, [sp, #160]
- rev32 v10.8h, v10.8h
- add v1.4s, v16.4s, v17.4s
- add v17.4s, v6.4s, v27.4s
- ldr q6, [sp, #192]
- dup v4.4s, w11
- rev32 v11.8h, v11.8h
- add v5.4s, v10.4s, v5.4s
- eor v8.16b, v1.16b, v8.16b
- stur q21, [x29, #-128]
- mov v18.d[1], v12.d[0]
- add v4.4s, v11.4s, v4.4s
- eor v9.16b, v5.16b, v9.16b
- ushr v12.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- ldur q21, [x29, #-80]
- ext v26.16b, v13.16b, v19.16b, #8
- eor v31.16b, v4.16b, v31.16b
- orr v8.16b, v8.16b, v12.16b
- ushr v12.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- ext v26.16b, v19.16b, v26.16b, #8
- mov v19.d[1], v13.d[0]
- orr v9.16b, v9.16b, v12.16b
- ushr v12.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v17.16b, v6.16b
- orr v31.16b, v31.16b, v12.16b
- dup v12.4s, w12
- rev32 v13.8h, v13.8h
- add v12.4s, v13.4s, v12.4s
- add v0.4s, v0.4s, v21.4s
- eor v14.16b, v12.16b, v15.16b
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v22.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v7.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v13.16b, v17.16b, v13.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v13.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v28.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v18.4s
- orr v14.16b, v14.16b, v15.16b
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v19.4s
- rev32 v13.8h, v13.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v30.4s
- add v4.4s, v4.4s, v13.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- mov v24.16b, v7.16b
- stur q7, [x29, #-112]
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- mov v7.16b, v26.16b
- add v3.4s, v3.4s, v26.4s
- ldr q26, [sp, #80]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v5.4s, v11.4s
- add v0.4s, v0.4s, v29.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v25.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v26.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v13.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- str q22, [sp, #128]
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- ldur q22, [x29, #-128]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- eor v8.16b, v5.16b, v8.16b
- mov v6.16b, v18.16b
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- ldur q18, [x29, #-144]
- orr v8.16b, v8.16b, v15.16b
- add v0.4s, v0.4s, v22.4s
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v24.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v18.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v13.16b, v17.16b, v13.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v13.8h, v13.8h
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v13.4s
- add v0.4s, v0.4s, v27.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v6.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v23.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v7.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v13.16b, v17.16b, v13.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v13.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v21.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v19.4s
- orr v14.16b, v14.16b, v15.16b
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v29.4s
- str q28, [sp, #112]
- rev32 v13.8h, v13.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v26.4s
- add v4.4s, v4.4s, v13.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- ldp q28, q23, [sp, #112]
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ldr q21, [sp, #96]
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v5.4s, v11.4s
- add v0.4s, v0.4s, v25.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v23.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v21.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v28.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v13.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- eor v8.16b, v5.16b, v8.16b
- mov v30.16b, v29.16b
- mov v29.16b, v25.16b
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- ldur q25, [x29, #-112]
- orr v8.16b, v8.16b, v15.16b
- add v0.4s, v0.4s, v20.4s
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v6.4s
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v7.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v25.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v13.16b, v17.16b, v13.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v13.8h, v13.8h
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v13.4s
- add v0.4s, v0.4s, v18.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v12.16b, v14.16b
- add v0.4s, v0.4s, v8.4s
- add v2.4s, v2.4s, v19.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v16.16b, v0.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v22.4s
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v21.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v14.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v13.16b, v17.16b, v13.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v13.4s, #8
- shl v13.4s, v13.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v13.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v14.16b, v12.16b, v14.16b
+ str q0, [sp, #96]
+.LBB3_5:
+ add x17, x11, x10
+ add x21, x12, x10
+ add x16, x13, x10
+ add x6, x14, x10
+ subs x15, x15, #1
+ add x10, x10, #64
+ ldp q0, q1, [x17]
+ csel w3, w27, wzr, eq
+ orr w3, w3, w4
+ mov w4, w19
+ and w3, w3, #0xff
+ ldp q3, q6, [x21]
+ dup v2.4s, w3
+ zip1 v21.4s, v0.4s, v3.4s
+ zip2 v19.4s, v0.4s, v3.4s
+ ldp q5, q7, [x16]
+ zip1 v17.4s, v1.4s, v6.4s
+ zip2 v22.4s, v1.4s, v6.4s
+ ldp q16, q18, [x6]
+ zip1 v4.4s, v5.4s, v16.4s
+ zip2 v0.4s, v5.4s, v16.4s
+ ldp q26, q27, [x17, #32]
+ zip1 v1.4s, v7.4s, v18.4s
+ zip2 v3.4s, v7.4s, v18.4s
+ zip2 v20.2d, v19.2d, v0.2d
+ mov v19.d[1], v0.d[0]
+ dup v18.4s, w9
+ ldp q8, q9, [x21, #32]
+ stur q19, [x29, #-208]
+ zip2 v7.4s, v26.4s, v8.4s
+ zip1 v10.4s, v26.4s, v8.4s
+ ldp q11, q5, [x16, #32]
+ zip2 v26.2d, v17.2d, v1.2d
+ stp q7, q26, [sp, #192]
+ mov v17.d[1], v1.d[0]
+ add v1.4s, v23.4s, v31.4s
+ ldp q16, q6, [x6, #32]
+ stur q17, [x29, #-256]
+ add v1.4s, v1.4s, v19.4s
+ zip1 v8.4s, v11.4s, v16.4s
+ zip2 v7.4s, v11.4s, v16.4s
+ zip1 v11.4s, v27.4s, v9.4s
+ zip2 v9.4s, v27.4s, v9.4s
+ zip2 v27.2d, v21.2d, v4.2d
+ mov v21.d[1], v4.d[0]
+ str q7, [sp, #224]
+ add v4.4s, v28.4s, v12.4s
+ zip1 v15.4s, v5.4s, v6.4s
+ zip2 v14.4s, v5.4s, v6.4s
+ stur q27, [x29, #-192]
+ zip2 v16.2d, v22.2d, v3.2d
+ stp q20, q21, [x29, #-240]
+ add v0.4s, v4.4s, v21.4s
+ ldp q6, q4, [sp, #96]
+ mov v22.d[1], v3.d[0]
+ add v5.4s, v25.4s, v30.4s
+ add v3.4s, v13.4s, v29.4s
+ eor v6.16b, v1.16b, v6.16b
+ add v1.4s, v1.4s, v20.4s
+ str q22, [sp, #256]
+ eor v4.16b, v0.16b, v4.16b
+ add v5.4s, v5.4s, v22.4s
+ add v3.4s, v3.4s, v17.4s
+ ldr q17, [sp, #48]
+ rev32 v6.8h, v6.8h
+ rev32 v4.8h, v4.8h
+ eor v2.16b, v5.16b, v2.16b
+ eor v7.16b, v3.16b, v24.16b
add v0.4s, v0.4s, v27.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #7
- shl v14.4s, v14.4s, #25
- add v0.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v30.4s
- orr v14.16b, v14.16b, v15.16b
- eor v13.16b, v0.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v29.4s
- rev32 v13.8h, v13.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v28.4s
- add v4.4s, v4.4s, v13.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- ldr q24, [sp, #160]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v14.16b, v1.16b, v14.16b
- add v5.4s, v5.4s, v11.4s
- stur q7, [x29, #-64]
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v14.4s, #12
- shl v14.4s, v14.4s, #20
- eor v8.16b, v5.16b, v8.16b
- mov v7.16b, v26.16b
- add v3.4s, v3.4s, v26.4s
- ldur q26, [x29, #-80]
- orr v14.16b, v14.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- add v0.4s, v0.4s, v23.4s
- orr v8.16b, v8.16b, v15.16b
- add v15.4s, v0.4s, v9.4s
- add v2.4s, v2.4s, v24.4s
- eor v0.16b, v15.16b, v13.16b
- add v2.4s, v2.4s, v31.4s
- ushr v13.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v14.4s
- add v17.4s, v17.4s, v26.4s
- orr v0.16b, v0.16b, v13.16b
- ushr v13.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v13.16b
- ushr v13.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v0.4s, v4.4s
- orr v10.16b, v10.16b, v13.16b
- ushr v13.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v13.16b
- ushr v13.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- orr v9.16b, v9.16b, v13.16b
- ushr v13.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- add v1.4s, v10.4s, v1.4s
- orr v31.16b, v31.16b, v13.16b
- eor v13.16b, v1.16b, v14.16b
- add v5.4s, v11.4s, v5.4s
- ushr v14.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v13.16b, v13.16b, v14.16b
- ushr v14.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- stur q6, [x29, #-96]
- orr v8.16b, v8.16b, v14.16b
- add v14.4s, v15.4s, v6.4s
- ldur q6, [x29, #-64]
- mov v18.16b, v19.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v18.4s
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v21.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v6.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v13.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v0.16b, v17.16b, v0.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v0.8h, v0.8h
+ add v21.4s, v4.4s, v17.4s
+ rev32 v31.8h, v2.8h
+ ldr q2, [sp, #80]
+ rev32 v7.8h, v7.8h
+ mov v27.16b, v16.16b
+ eor v17.16b, v21.16b, v28.16b
+ add v29.4s, v6.4s, v2.4s
+ ldr q2, [sp, #64]
+ add v24.4s, v31.4s, v18.4s
str q27, [sp, #176]
- mov v27.16b, v30.16b
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v0.4s
- add v14.4s, v14.4s, v25.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v27.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v20.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v7.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v13.4s
- mov v30.16b, v23.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v0.16b, v17.16b, v0.16b
- add v1.4s, v16.4s, v1.4s
- ldur q23, [x29, #-144]
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v0.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v23.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v29.4s
- orr v13.16b, v13.16b, v15.16b
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v30.4s
- rev32 v0.8h, v0.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
- add v17.4s, v17.4s, v26.4s
- add v4.4s, v4.4s, v0.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- ldur q22, [x29, #-128]
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- ldr q26, [sp, #176]
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v5.4s, v11.4s
- add v14.4s, v14.4s, v24.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v22.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v28.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
+ ushr v19.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ add v30.4s, v7.4s, v2.4s
+ eor v18.16b, v29.16b, v23.16b
+ orr v12.16b, v17.16b, v19.16b
+ eor v17.16b, v30.16b, v13.16b
+ eor v19.16b, v24.16b, v25.16b
+ ushr v23.4s, v18.4s, #12
+ shl v18.4s, v18.4s, #20
+ ushr v25.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ ushr v28.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ orr v13.16b, v18.16b, v23.16b
+ orr v25.16b, v17.16b, v25.16b
+ orr v2.16b, v19.16b, v28.16b
+ add v28.4s, v0.4s, v12.4s
+ add v0.4s, v3.4s, v26.4s
+ add v18.4s, v1.4s, v13.4s
+ add v3.4s, v5.4s, v16.4s
+ eor v1.16b, v28.16b, v4.16b
+ add v17.4s, v0.4s, v25.4s
+ eor v0.16b, v18.16b, v6.16b
+ add v19.4s, v3.4s, v2.4s
+ ushr v16.4s, v1.4s, #8
+ shl v3.4s, v1.4s, #24
+ eor v4.16b, v17.16b, v7.16b
+ ushr v6.4s, v0.4s, #8
+ shl v1.4s, v0.4s, #24
+ eor v5.16b, v19.16b, v31.16b
+ ushr v23.4s, v4.4s, #8
+ shl v4.4s, v4.4s, #24
+ orr v7.16b, v3.16b, v16.16b
+ orr v6.16b, v1.16b, v6.16b
+ ushr v31.4s, v5.4s, #8
+ shl v0.4s, v5.4s, #24
+ orr v5.16b, v4.16b, v23.16b
+ add v4.4s, v7.4s, v21.4s
+ ldr q21, [sp, #192]
+ add v3.4s, v6.4s, v29.4s
+ orr v31.16b, v0.16b, v31.16b
+ add v23.4s, v5.4s, v30.4s
+ eor v0.16b, v4.16b, v12.16b
+ eor v1.16b, v3.16b, v13.16b
+ add v16.4s, v31.4s, v24.4s
+ eor v20.16b, v23.16b, v25.16b
+ ushr v24.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v29.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v30.4s, v20.4s, #7
+ shl v20.4s, v20.4s, #25
+ orr v25.16b, v0.16b, v24.16b
+ orr v0.16b, v1.16b, v29.16b
+ mov v29.16b, v10.16b
+ orr v1.16b, v20.16b, v30.16b
+ mov v20.16b, v10.16b
+ mov v24.16b, v21.16b
+ ldr q20, [sp, #224]
+ mov v29.d[1], v8.d[0]
+ mov v13.16b, v9.16b
+ zip2 v30.2d, v10.2d, v8.2d
+ zip2 v8.2d, v21.2d, v20.2d
+ mov v26.16b, v11.16b
+ mov v24.d[1], v20.d[0]
+ add v20.4s, v28.4s, v29.4s
+ mov v13.d[1], v14.d[0]
+ str q8, [sp, #128]
+ eor v2.16b, v16.16b, v2.16b
+ mov v26.d[1], v15.d[0]
+ str q24, [sp, #192]
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v13.4s
+ ushr v12.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ zip2 v10.2d, v9.2d, v14.2d
+ add v18.4s, v18.4s, v24.4s
add v17.4s, v17.4s, v26.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v0.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- orr v8.16b, v8.16b, v15.16b
- add v14.4s, v14.4s, v18.4s
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v27.4s
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v7.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
+ mov v14.16b, v26.16b
+ eor v26.16b, v20.16b, v31.16b
+ stp q10, q30, [sp, #224]
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v12.16b
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v30.4s
+ zip2 v21.2d, v11.2d, v15.2d
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v10.4s
+ add v20.4s, v20.4s, v0.4s
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v8.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
add v17.4s, v17.4s, v21.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
- add v17.4s, v17.4s, v13.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v0.16b, v17.16b, v0.16b
- add v14.4s, v14.4s, v6.4s
- ldur q6, [x29, #-96]
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v0.8h, v0.8h
- stur q20, [x29, #-160]
- mov v20.16b, v29.16b
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v0.4s
- mov v19.16b, v29.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- mov v19.16b, v28.16b
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ ldp q28, q12, [x29, #-256]
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v12.4s
+ mov v15.16b, v29.16b
+ ldur q29, [x29, #-208]
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ str q15, [sp, #160]
+ add v20.4s, v20.4s, v29.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v27.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v28.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
add v3.4s, v3.4s, v6.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v19.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v3.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v24.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v22.4s
+ add v18.4s, v18.4s, v0.4s
+ mov v9.16b, v30.16b
+ mov v30.16b, v21.16b
+ ldur q21, [x29, #-224]
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ str q30, [sp, #144]
+ add v17.4s, v17.4s, v21.4s
+ ldur q21, [x29, #-192]
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v30.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v20.4s, v20.4s, v21.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v10.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v14.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v9.4s
+ ldr q9, [sp, #208]
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v8.4s
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v15.4s
+ add v20.4s, v20.4s, v0.4s
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v9.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
add v17.4s, v17.4s, v13.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v0.16b, v17.16b, v0.16b
- add v1.4s, v16.4s, v1.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v10.4s, v5.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v0.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v25.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v30.4s
- orr v13.16b, v13.16b, v15.16b
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
- add v3.4s, v3.4s, v24.4s
- rev32 v0.8h, v0.8h
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
- add v17.4s, v17.4s, v26.4s
- mov v29.16b, v27.16b
- add v4.4s, v4.4s, v0.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- ldur q27, [x29, #-160]
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v12.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v17.16b, v11.16b
- ldur q6, [x29, #-80]
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v1.4s, v10.4s
- rev32 v11.8h, v11.8h
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v5.4s, v11.4s
- add v14.4s, v14.4s, v22.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v8.16b, v5.16b, v8.16b
- add v14.4s, v14.4s, v9.4s
- add v2.4s, v2.4s, v27.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v0.16b, v14.16b, v0.16b
- add v2.4s, v2.4s, v31.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v24.4s
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v12.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v30.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v27.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
add v3.4s, v3.4s, v6.4s
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- eor v16.16b, v2.16b, v16.16b
- add v3.4s, v3.4s, v13.4s
- add v17.4s, v17.4s, v23.4s
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v3.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- eor v11.16b, v17.16b, v11.16b
- add v4.4s, v0.4s, v4.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v11.4s, #8
- shl v11.4s, v11.4s, #24
- eor v9.16b, v4.16b, v9.16b
- add v12.4s, v16.4s, v12.4s
- orr v11.16b, v11.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v31.16b, v12.16b, v31.16b
- add v1.4s, v10.4s, v1.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- eor v13.16b, v1.16b, v13.16b
- add v5.4s, v11.4s, v5.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #7
- shl v13.4s, v13.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- orr v8.16b, v8.16b, v15.16b
- add v14.4s, v14.4s, v29.4s
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v20.4s
- mov v28.16b, v7.16b
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- add v3.4s, v3.4s, v19.4s
- rev32 v16.8h, v16.8h
- eor v10.16b, v2.16b, v10.16b
- add v3.4s, v3.4s, v31.4s
- add v17.4s, v17.4s, v28.4s
- add v1.4s, v1.4s, v16.4s
- rev32 v10.8h, v10.8h
- eor v11.16b, v3.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v14.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v28.4s
+ add v18.4s, v18.4s, v0.4s
+ mov v10.16b, v13.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v29.4s
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v10.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ mov v22.16b, v8.16b
+ ldp q8, q28, [sp, #240]
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v20.4s, v20.4s, v28.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v15.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v8.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v22.4s
+ ldur q22, [x29, #-256]
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v9.4s
+ mov v13.16b, v12.16b
+ mov v12.16b, v27.16b
+ mov v27.16b, v9.16b
+ ldur q9, [x29, #-192]
+ mov v21.16b, v15.16b
+ ldr q15, [sp, #224]
+ ushr v11.4s, v1.4s, #12
+ ldur q21, [x29, #-224]
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v9.4s
+ add v20.4s, v20.4s, v0.4s
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v21.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v15.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v14.4s
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v24.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v10.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v30.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ add v3.4s, v3.4s, v6.4s
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v8.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v12.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
add v17.4s, v17.4s, v13.4s
- eor v8.16b, v1.16b, v8.16b
- add v5.4s, v5.4s, v10.4s
- rev32 v11.8h, v11.8h
- eor v0.16b, v17.16b, v0.16b
- ushr v15.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- eor v9.16b, v5.16b, v9.16b
- add v4.4s, v4.4s, v11.4s
- rev32 v0.8h, v0.8h
- orr v8.16b, v8.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v31.16b, v4.16b, v31.16b
- add v12.4s, v12.4s, v0.4s
- add v14.4s, v14.4s, v21.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- eor v13.16b, v12.16b, v13.16b
- add v14.4s, v14.4s, v8.4s
- add v2.4s, v2.4s, v30.4s
- orr v31.16b, v31.16b, v15.16b
- ushr v15.4s, v13.4s, #12
- shl v13.4s, v13.4s, #20
- eor v16.16b, v14.16b, v16.16b
- add v2.4s, v2.4s, v9.4s
- orr v13.16b, v13.16b, v15.16b
- ushr v15.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v10.16b, v2.16b, v10.16b
- orr v16.16b, v16.16b, v15.16b
- ushr v15.4s, v10.4s, #8
- shl v10.4s, v10.4s, #24
- add v3.4s, v3.4s, v18.4s
- orr v10.16b, v10.16b, v15.16b
- add v15.4s, v3.4s, v31.4s
- eor v3.16b, v15.16b, v11.16b
- ushr v11.4s, v3.4s, #8
- shl v3.4s, v3.4s, #24
- orr v11.16b, v3.16b, v11.16b
- add v3.4s, v17.4s, v6.4s
- add v17.4s, v3.4s, v13.4s
- eor v0.16b, v17.16b, v0.16b
- ushr v3.4s, v0.4s, #8
- shl v0.4s, v0.4s, #24
- add v1.4s, v16.4s, v1.4s
- orr v0.16b, v0.16b, v3.16b
- eor v3.16b, v1.16b, v8.16b
- ushr v8.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- add v5.4s, v10.4s, v5.4s
- orr v8.16b, v3.16b, v8.16b
- eor v3.16b, v5.16b, v9.16b
- add v4.4s, v11.4s, v4.4s
- ushr v9.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- eor v31.16b, v4.16b, v31.16b
- mov v7.16b, v23.16b
- mov v23.16b, v28.16b
- mov v28.16b, v6.16b
- orr v3.16b, v3.16b, v9.16b
- ushr v9.4s, v31.4s, #7
- shl v31.4s, v31.4s, #25
- ldur q6, [x29, #-64]
- orr v31.16b, v31.16b, v9.16b
- add v9.4s, v0.4s, v12.4s
- eor v12.16b, v9.16b, v13.16b
- ushr v13.4s, v12.4s, #7
- shl v12.4s, v12.4s, #25
- orr v12.16b, v12.16b, v13.16b
- add v13.4s, v14.4s, v6.4s
- add v13.4s, v13.4s, v3.4s
- eor v0.16b, v13.16b, v0.16b
- add v2.4s, v2.4s, v24.4s
- rev32 v14.8h, v0.8h
- add v0.4s, v2.4s, v31.4s
- add v6.4s, v4.4s, v14.4s
- eor v2.16b, v0.16b, v16.16b
- eor v3.16b, v6.16b, v3.16b
- rev32 v16.8h, v2.8h
- ushr v4.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- add v2.4s, v9.4s, v16.4s
- orr v4.16b, v3.16b, v4.16b
- eor v3.16b, v2.16b, v31.16b
- ushr v31.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- orr v3.16b, v3.16b, v31.16b
- add v31.4s, v15.4s, v22.4s
- add v31.4s, v31.4s, v12.4s
- add v17.4s, v17.4s, v7.4s
- eor v9.16b, v31.16b, v10.16b
- add v17.4s, v17.4s, v8.4s
- rev32 v9.8h, v9.8h
- eor v11.16b, v17.16b, v11.16b
- add v1.4s, v1.4s, v9.4s
- rev32 v11.8h, v11.8h
- eor v10.16b, v1.16b, v12.16b
- add v5.4s, v5.4s, v11.4s
- ushr v12.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v8.16b, v5.16b, v8.16b
- orr v10.16b, v10.16b, v12.16b
- ushr v12.4s, v8.4s, #12
- shl v8.4s, v8.4s, #20
- orr v8.16b, v8.16b, v12.16b
- add v12.4s, v13.4s, v27.4s
- add v12.4s, v12.4s, v4.4s
- eor v13.16b, v12.16b, v14.16b
- ldur q14, [x29, #-96]
- mov v25.16b, v29.16b
- add v29.4s, v12.4s, v20.4s
- add v20.4s, v31.4s, v26.4s
- add v0.4s, v0.4s, v14.4s
- add v0.4s, v0.4s, v3.4s
- eor v16.16b, v0.16b, v16.16b
- add v0.4s, v0.4s, v30.4s
- ldur q30, [x29, #-112]
+ ldr q13, [sp, #160]
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v15.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v20.4s, v20.4s, v22.4s
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v9.4s
+ mov v29.16b, v14.16b
+ ldr q14, [sp, #128]
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v14.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v27.4s
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ add v20.4s, v20.4s, v21.4s
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v28.4s
+ add v20.4s, v20.4s, v0.4s
+ mov v12.16b, v27.16b
+ ldur q27, [x29, #-208]
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v27.4s
+ add v19.4s, v19.4s, v25.4s
+ eor v26.16b, v20.16b, v26.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v13.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v17.4s, v17.4s, v2.4s
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v23.16b, v0.16b
+ orr v6.16b, v6.16b, v11.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ add v4.4s, v6.4s, v4.4s
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ add v18.4s, v18.4s, v8.4s
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v29.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v15.4s
+ eor v6.16b, v6.16b, v18.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v10.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ rev32 v6.8h, v6.8h
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ add v3.4s, v3.4s, v6.4s
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v3.16b, v0.16b
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
+ add v18.4s, v18.4s, v14.4s
+ mov v30.16b, v29.16b
+ mov v29.16b, v15.16b
+ ldr q15, [sp, #144]
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v20.4s, v20.4s, v15.4s
+ add v18.4s, v18.4s, v0.4s
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v24.4s
+ eor v6.16b, v18.16b, v6.16b
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v13.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v6.16b, v6.16b, v11.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v7.16b, v7.16b, v31.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ add v3.4s, v6.4s, v3.4s
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ eor v0.16b, v3.16b, v0.16b
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ mov v9.16b, v28.16b
+ mov v28.16b, v10.16b
+ ldr q10, [sp, #176]
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v0.16b, v0.16b, v11.16b
add v20.4s, v20.4s, v10.4s
- eor v31.16b, v20.16b, v9.16b
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v9.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v12.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v1.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ eor v6.16b, v17.16b, v6.16b
+ rev32 v7.8h, v7.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ rev32 v6.8h, v6.8h
+ add v16.4s, v16.4s, v7.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v4.4s, v6.4s
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ eor v2.16b, v4.16b, v2.16b
+ add v20.4s, v20.4s, v27.4s
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v22.4s
+ mov v9.16b, v22.16b
+ ldur q22, [x29, #-240]
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v22.4s
+ add v19.4s, v19.4s, v25.4s
+ mov v24.16b, v21.16b
+ ldur q21, [x29, #-192]
+ orr v2.16b, v2.16b, v11.16b
+ eor v26.16b, v20.16b, v26.16b
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ ushr v31.4s, v26.4s, #8
+ add v17.4s, v17.4s, v2.4s
+ shl v26.4s, v26.4s, #24
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ orr v26.16b, v26.16b, v31.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ orr v6.16b, v6.16b, v11.16b
+ eor v0.16b, v23.16b, v0.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v6.4s, v4.4s
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ orr v0.16b, v0.16b, v31.16b
+ eor v2.16b, v4.16b, v2.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v8.4s
+ add v18.4s, v18.4s, v14.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
+ add v17.4s, v17.4s, v13.4s
+ add v18.4s, v18.4s, v0.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v29.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v6.16b, v18.16b
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ rev32 v6.8h, v6.8h
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ add v3.4s, v3.4s, v6.4s
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v16.4s, v26.4s
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ ushr v11.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ orr v0.16b, v0.16b, v11.16b
+ ushr v31.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
add v20.4s, v20.4s, v28.4s
+ add v18.4s, v18.4s, v12.4s
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ orr v1.16b, v1.16b, v31.16b
+ add v20.4s, v20.4s, v25.4s
add v17.4s, v17.4s, v30.4s
- add v17.4s, v17.4s, v8.4s
- eor v9.16b, v17.16b, v11.16b
- ushr v28.4s, v13.4s, #8
- shl v11.4s, v13.4s, #24
- orr v28.16b, v11.16b, v28.16b
- ushr v11.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- orr v16.16b, v16.16b, v11.16b
- ushr v11.4s, v31.4s, #8
- shl v31.4s, v31.4s, #24
- add v6.4s, v28.4s, v6.4s
- orr v31.16b, v31.16b, v11.16b
- ushr v11.4s, v9.4s, #8
- shl v9.4s, v9.4s, #24
- add v2.4s, v16.4s, v2.4s
- eor v4.16b, v6.16b, v4.16b
- orr v9.16b, v9.16b, v11.16b
- add v1.4s, v31.4s, v1.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v11.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- add v5.4s, v9.4s, v5.4s
- eor v10.16b, v1.16b, v10.16b
- orr v4.16b, v4.16b, v11.16b
- ushr v11.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- eor v8.16b, v5.16b, v8.16b
- orr v3.16b, v3.16b, v11.16b
- ushr v11.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- orr v10.16b, v10.16b, v11.16b
- ushr v11.4s, v8.4s, #7
- shl v8.4s, v8.4s, #25
- orr v8.16b, v8.16b, v11.16b
- add v29.4s, v29.4s, v8.4s
- eor v16.16b, v29.16b, v16.16b
- add v0.4s, v0.4s, v4.4s
- mov v12.16b, v26.16b
- add v17.4s, v17.4s, v19.4s
- add v26.4s, v29.4s, v23.4s
- eor v29.16b, v0.16b, v31.16b
- add v20.4s, v20.4s, v3.4s
- rev32 v16.8h, v16.8h
- stur q18, [x29, #-176]
- mov v18.16b, v27.16b
- add v0.4s, v0.4s, v24.4s
- eor v27.16b, v20.16b, v9.16b
- add v17.4s, v17.4s, v10.4s
- rev32 v24.8h, v29.8h
- add v1.4s, v1.4s, v16.4s
+ add v18.4s, v18.4s, v0.4s
+ orr v2.16b, v2.16b, v11.16b
+ add v19.4s, v19.4s, v21.4s
+ eor v7.16b, v20.16b, v7.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v18.16b, v6.16b
+ add v19.4s, v19.4s, v2.4s
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ orr v7.16b, v7.16b, v31.16b
+ eor v26.16b, v19.16b, v26.16b
+ orr v6.16b, v6.16b, v11.16b
+ ushr v31.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ ushr v11.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ add v4.4s, v7.4s, v4.4s
+ orr v5.16b, v5.16b, v31.16b
+ add v3.4s, v6.4s, v3.4s
+ orr v26.16b, v26.16b, v11.16b
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v5.4s, v23.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v26.4s, v16.4s
+ ushr v31.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ ushr v11.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v1.16b, v23.16b, v1.16b
+ orr v25.16b, v25.16b, v31.16b
+ eor v2.16b, v16.16b, v2.16b
+ orr v0.16b, v0.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v20.4s, v20.4s, v15.4s
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v1.16b, v1.16b, v31.16b
+ add v18.4s, v18.4s, v24.4s
+ add v20.4s, v20.4s, v0.4s
+ add v19.4s, v19.4s, v9.4s
+ mov v8.16b, v13.16b
+ ldur q13, [x29, #-208]
+ orr v2.16b, v2.16b, v11.16b
+ add v18.4s, v18.4s, v1.4s
+ add v17.4s, v17.4s, v13.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v19.4s, v19.4s, v25.4s
+ eor v7.16b, v18.16b, v7.16b
+ add v17.4s, v17.4s, v2.4s
+ rev32 v26.8h, v26.8h
+ eor v5.16b, v19.16b, v5.16b
+ rev32 v7.8h, v7.8h
+ eor v6.16b, v17.16b, v6.16b
+ add v23.4s, v23.4s, v26.4s
+ rev32 v5.8h, v5.8h
+ add v16.4s, v16.4s, v7.4s
+ rev32 v6.8h, v6.8h
+ eor v0.16b, v23.16b, v0.16b
+ add v3.4s, v3.4s, v5.4s
+ eor v1.16b, v16.16b, v1.16b
+ add v4.4s, v4.4s, v6.4s
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v25.16b, v3.16b, v25.16b
+ ushr v11.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ orr v0.16b, v0.16b, v31.16b
+ eor v2.16b, v4.16b, v2.16b
+ ushr v31.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ orr v1.16b, v1.16b, v11.16b
+ ushr v11.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v20.4s, v20.4s, v22.4s
+ orr v25.16b, v25.16b, v31.16b
+ add v19.4s, v19.4s, v10.4s
+ mov v27.16b, v12.16b
+ mov v12.16b, v30.16b
+ mov v29.16b, v21.16b
+ mov v21.16b, v24.16b
+ ldr q24, [sp, #192]
+ mov v30.16b, v22.16b
+ ldr q22, [sp, #256]
+ orr v2.16b, v2.16b, v11.16b
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v24.4s
+ add v19.4s, v19.4s, v25.4s
+ add v17.4s, v17.4s, v22.4s
+ eor v26.16b, v20.16b, v26.16b
+ add v18.4s, v18.4s, v1.4s
+ eor v5.16b, v19.16b, v5.16b
+ add v17.4s, v17.4s, v2.4s
+ ushr v31.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ ushr v11.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ eor v7.16b, v18.16b, v7.16b
+ eor v6.16b, v17.16b, v6.16b
+ orr v26.16b, v26.16b, v31.16b
+ orr v5.16b, v5.16b, v11.16b
+ ushr v31.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v11.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ add v23.4s, v26.4s, v23.4s
+ orr v7.16b, v7.16b, v31.16b
+ add v3.4s, v5.4s, v3.4s
+ orr v6.16b, v6.16b, v11.16b
+ eor v0.16b, v23.16b, v0.16b
+ add v16.4s, v7.4s, v16.4s
+ eor v25.16b, v3.16b, v25.16b
+ add v4.4s, v6.4s, v4.4s
+ ushr v31.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v11.4s, v25.4s, #7
+ shl v25.4s, v25.4s, #25
+ eor v1.16b, v16.16b, v1.16b
+ eor v2.16b, v4.16b, v2.16b
+ orr v0.16b, v0.16b, v31.16b
+ orr v25.16b, v25.16b, v11.16b
+ ushr v31.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v11.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v20.4s, v20.4s, v14.4s
+ add v18.4s, v18.4s, v27.4s
+ ldr q27, [sp, #224]
+ orr v1.16b, v1.16b, v31.16b
+ orr v2.16b, v2.16b, v11.16b
add v20.4s, v20.4s, v25.4s
- eor v25.16b, v17.16b, v28.16b
- rev32 v27.8h, v27.8h
- add v5.4s, v5.4s, v24.4s
- eor v28.16b, v1.16b, v8.16b
- rev32 v25.8h, v25.8h
- add v6.4s, v6.4s, v27.4s
- eor v4.16b, v5.16b, v4.16b
- ushr v31.4s, v28.4s, #12
- shl v28.4s, v28.4s, #20
- add v2.4s, v2.4s, v25.4s
- eor v3.16b, v6.16b, v3.16b
- orr v28.16b, v28.16b, v31.16b
- ushr v31.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- eor v29.16b, v2.16b, v10.16b
- orr v4.16b, v4.16b, v31.16b
- ushr v31.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- add v26.4s, v26.4s, v28.4s
- orr v3.16b, v3.16b, v31.16b
- ushr v31.4s, v29.4s, #12
- shl v29.4s, v29.4s, #20
- eor v16.16b, v26.16b, v16.16b
- add v0.4s, v0.4s, v4.4s
- add v17.4s, v17.4s, v12.4s
- orr v29.16b, v29.16b, v31.16b
- eor v24.16b, v0.16b, v24.16b
- add v0.4s, v0.4s, v22.4s
- add v20.4s, v20.4s, v3.4s
- ushr v22.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- add v23.4s, v26.4s, v21.4s
- eor v21.16b, v20.16b, v27.16b
add v17.4s, v17.4s, v29.4s
- orr v16.16b, v16.16b, v22.16b
- ushr v22.4s, v24.4s, #8
- shl v24.4s, v24.4s, #24
- eor v25.16b, v17.16b, v25.16b
- orr v22.16b, v24.16b, v22.16b
+ add v18.4s, v18.4s, v0.4s
+ add v19.4s, v19.4s, v8.4s
+ eor v7.16b, v7.16b, v20.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v6.16b, v18.16b
+ add v19.4s, v19.4s, v2.4s
+ rev32 v7.8h, v7.8h
+ eor v5.16b, v17.16b, v5.16b
+ rev32 v6.8h, v6.8h
+ eor v26.16b, v19.16b, v26.16b
+ add v4.4s, v4.4s, v7.4s
+ rev32 v5.8h, v5.8h
+ add v3.4s, v3.4s, v6.4s
+ rev32 v26.8h, v26.8h
+ eor v25.16b, v4.16b, v25.16b
+ add v23.4s, v23.4s, v5.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v16.4s, v26.4s
+ ushr v29.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ ushr v31.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v1.16b, v23.16b, v1.16b
+ eor v2.16b, v16.16b, v2.16b
+ orr v25.16b, v25.16b, v29.16b
+ orr v0.16b, v0.16b, v31.16b
+ ushr v29.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ ushr v31.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ add v18.4s, v18.4s, v21.4s
+ ldr q21, [sp, #240]
+ add v20.4s, v20.4s, v27.4s
+ prfm pldl1keep, [x17, #256]
+ orr v1.16b, v1.16b, v29.16b
+ prfm pldl1keep, [x21, #256]
+ orr v2.16b, v2.16b, v31.16b
+ prfm pldl1keep, [x16, #256]
+ add v18.4s, v18.4s, v0.4s
+ prfm pldl1keep, [x6, #256]
+ add v17.4s, v17.4s, v21.4s
+ add v19.4s, v19.4s, v22.4s
+ add v20.4s, v20.4s, v25.4s
+ eor v6.16b, v18.16b, v6.16b
+ add v17.4s, v17.4s, v1.4s
+ add v19.4s, v19.4s, v2.4s
+ eor v7.16b, v20.16b, v7.16b
+ ushr v22.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ eor v5.16b, v17.16b, v5.16b
+ eor v26.16b, v19.16b, v26.16b
+ ushr v21.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ orr v6.16b, v6.16b, v22.16b
+ ushr v22.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
+ ushr v29.4s, v26.4s, #8
+ shl v26.4s, v26.4s, #24
+ orr v7.16b, v7.16b, v21.16b
+ orr v5.16b, v5.16b, v22.16b
+ add v3.4s, v6.4s, v3.4s
+ orr v21.16b, v26.16b, v29.16b
+ add v4.4s, v7.4s, v4.4s
+ add v22.4s, v5.4s, v23.4s
+ eor v0.16b, v3.16b, v0.16b
+ add v16.4s, v21.4s, v16.4s
+ eor v23.16b, v4.16b, v25.16b
+ eor v1.16b, v22.16b, v1.16b
+ ushr v25.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v2.16b, v16.16b, v2.16b
+ ushr v26.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ orr v0.16b, v0.16b, v25.16b
+ ushr v25.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v29.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ add v20.4s, v20.4s, v28.4s
+ orr v23.16b, v23.16b, v26.16b
+ orr v1.16b, v1.16b, v25.16b
+ orr v2.16b, v2.16b, v29.16b
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v13.4s
+ add v17.4s, v17.4s, v30.4s
+ add v19.4s, v19.4s, v10.4s
+ eor v21.16b, v20.16b, v21.16b
+ add v18.4s, v18.4s, v1.4s
+ add v17.4s, v17.4s, v2.4s
+ add v19.4s, v19.4s, v23.4s
+ rev32 v21.8h, v21.8h
+ eor v7.16b, v18.16b, v7.16b
+ eor v6.16b, v17.16b, v6.16b
+ eor v5.16b, v19.16b, v5.16b
+ add v22.4s, v22.4s, v21.4s
+ rev32 v7.8h, v7.8h
+ rev32 v6.8h, v6.8h
+ rev32 v5.8h, v5.8h
+ eor v0.16b, v22.16b, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ add v4.4s, v4.4s, v6.4s
+ add v3.4s, v3.4s, v5.4s
+ ushr v25.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v1.16b, v16.16b, v1.16b
+ eor v2.16b, v4.16b, v2.16b
+ eor v23.16b, v3.16b, v23.16b
+ orr v0.16b, v0.16b, v25.16b
+ ushr v25.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ ushr v26.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ushr v27.4s, v23.4s, #12
+ shl v23.4s, v23.4s, #20
+ orr v1.16b, v1.16b, v25.16b
+ add v20.4s, v20.4s, v24.4s
+ orr v2.16b, v2.16b, v26.16b
+ orr v23.16b, v23.16b, v27.16b
+ add v18.4s, v18.4s, v12.4s
+ add v17.4s, v17.4s, v9.4s
+ add v19.4s, v19.4s, v15.4s
+ add v20.4s, v20.4s, v0.4s
+ add v18.4s, v18.4s, v1.4s
+ add v17.4s, v17.4s, v2.4s
+ add v19.4s, v19.4s, v23.4s
+ eor v21.16b, v20.16b, v21.16b
+ eor v7.16b, v18.16b, v7.16b
+ eor v6.16b, v17.16b, v6.16b
+ eor v5.16b, v19.16b, v5.16b
ushr v24.4s, v21.4s, #8
shl v21.4s, v21.4s, #24
+ ushr v25.4s, v7.4s, #8
+ shl v7.4s, v7.4s, #24
+ ushr v26.4s, v6.4s, #8
+ shl v6.4s, v6.4s, #24
+ ushr v27.4s, v5.4s, #8
+ shl v5.4s, v5.4s, #24
orr v21.16b, v21.16b, v24.16b
- ushr v24.4s, v25.4s, #8
- shl v25.4s, v25.4s, #24
- add v1.4s, v16.4s, v1.4s
- orr v24.16b, v25.16b, v24.16b
- add v5.4s, v22.4s, v5.4s
- eor v25.16b, v1.16b, v28.16b
- add v6.4s, v21.4s, v6.4s
- eor v4.16b, v5.16b, v4.16b
- ushr v27.4s, v25.4s, #7
- shl v25.4s, v25.4s, #25
- add v2.4s, v24.4s, v2.4s
- eor v3.16b, v6.16b, v3.16b
- orr v25.16b, v25.16b, v27.16b
- ushr v27.4s, v4.4s, #7
- shl v4.4s, v4.4s, #25
- ldur q19, [x29, #-176]
- eor v26.16b, v2.16b, v29.16b
- orr v4.16b, v4.16b, v27.16b
- ushr v27.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- orr v3.16b, v3.16b, v27.16b
- ushr v27.4s, v26.4s, #7
- shl v26.4s, v26.4s, #25
- add v20.4s, v20.4s, v18.4s
- add v17.4s, v17.4s, v30.4s
- orr v26.16b, v26.16b, v27.16b
- add v0.4s, v0.4s, v3.4s
- eor v16.16b, v0.16b, v16.16b
- add v0.4s, v0.4s, v19.4s
- add v19.4s, v20.4s, v26.4s
- add v17.4s, v17.4s, v25.4s
- eor v20.16b, v19.16b, v22.16b
- add v7.4s, v19.4s, v7.4s
- eor v19.16b, v17.16b, v21.16b
- ldur q21, [x29, #-64]
- add v23.4s, v23.4s, v4.4s
- eor v24.16b, v23.16b, v24.16b
- rev32 v16.8h, v16.8h
- add v17.4s, v17.4s, v21.4s
- rev32 v21.8h, v24.8h
- add v6.4s, v6.4s, v21.4s
- rev32 v20.8h, v20.8h
- add v2.4s, v2.4s, v16.4s
- eor v4.16b, v6.16b, v4.16b
- rev32 v19.8h, v19.8h
- add v1.4s, v1.4s, v20.4s
- eor v3.16b, v2.16b, v3.16b
- ushr v24.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- add v5.4s, v5.4s, v19.4s
- eor v22.16b, v1.16b, v26.16b
- orr v4.16b, v4.16b, v24.16b
- ushr v24.4s, v3.4s, #12
- shl v3.4s, v3.4s, #20
- add v18.4s, v23.4s, v14.4s
- eor v23.16b, v5.16b, v25.16b
- orr v3.16b, v3.16b, v24.16b
- ushr v24.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- orr v22.16b, v22.16b, v24.16b
- ushr v24.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- orr v23.16b, v23.16b, v24.16b
- add v18.4s, v18.4s, v4.4s
- add v0.4s, v0.4s, v3.4s
- add v24.4s, v17.4s, v23.4s
- eor v17.16b, v18.16b, v21.16b
- add v7.4s, v7.4s, v22.4s
- eor v16.16b, v0.16b, v16.16b
- ushr v21.4s, v17.4s, #8
- shl v17.4s, v17.4s, #24
- eor v20.16b, v7.16b, v20.16b
- orr v21.16b, v17.16b, v21.16b
- ushr v17.4s, v16.4s, #8
- shl v16.4s, v16.4s, #24
- eor v19.16b, v24.16b, v19.16b
- orr v16.16b, v16.16b, v17.16b
- ushr v17.4s, v20.4s, #8
- shl v20.4s, v20.4s, #24
- orr v25.16b, v20.16b, v17.16b
- ushr v17.4s, v19.4s, #8
- shl v19.4s, v19.4s, #24
- orr v19.16b, v19.16b, v17.16b
- add v1.4s, v25.4s, v1.4s
- eor v22.16b, v1.16b, v22.16b
- eor v20.16b, v1.16b, v18.16b
- add v1.4s, v19.4s, v5.4s
- eor v26.16b, v1.16b, v0.16b
- add v0.4s, v21.4s, v6.4s
- eor v5.16b, v1.16b, v23.16b
- eor v1.16b, v0.16b, v4.16b
- eor v17.16b, v0.16b, v7.16b
- add v0.4s, v16.4s, v2.4s
- eor v2.16b, v0.16b, v3.16b
- eor v6.16b, v0.16b, v24.16b
- ushr v0.4s, v1.4s, #7
+ orr v7.16b, v7.16b, v25.16b
+ orr v6.16b, v6.16b, v26.16b
+ orr v5.16b, v5.16b, v27.16b
+ add v22.4s, v21.4s, v22.4s
+ add v16.4s, v7.4s, v16.4s
+ add v4.4s, v6.4s, v4.4s
+ add v3.4s, v5.4s, v3.4s
+ eor v0.16b, v22.16b, v0.16b
+ eor v1.16b, v16.16b, v1.16b
+ eor v2.16b, v4.16b, v2.16b
+ eor v23.16b, v3.16b, v23.16b
+ ushr v24.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v25.4s, v1.4s, #7
shl v1.4s, v1.4s, #25
- orr v0.16b, v1.16b, v0.16b
- ushr v1.4s, v2.4s, #7
+ ushr v26.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- orr v1.16b, v2.16b, v1.16b
- ushr v2.4s, v22.4s, #7
- shl v3.4s, v22.4s, #25
- orr v2.16b, v3.16b, v2.16b
- ushr v3.4s, v5.4s, #7
- shl v4.4s, v5.4s, #25
- orr v3.16b, v4.16b, v3.16b
- eor v8.16b, v16.16b, v3.16b
- eor v9.16b, v25.16b, v0.16b
- eor v31.16b, v1.16b, v19.16b
- cmp x17, x22
- eor v15.16b, v2.16b, v21.16b
- mov w18, w19
- b.ne .LBB2_4
-.LBB2_7:
- zip1 v0.4s, v20.4s, v26.4s
- zip2 v1.4s, v20.4s, v26.4s
- zip1 v2.4s, v17.4s, v6.4s
- zip2 v3.4s, v17.4s, v6.4s
- zip1 v4.4s, v8.4s, v9.4s
- zip2 v5.4s, v8.4s, v9.4s
- zip1 v6.4s, v31.4s, v15.4s
- zip2 v7.4s, v31.4s, v15.4s
- add x13, x20, #4
- tst w5, #0x1
- sub x28, x28, #4
- zip1 v16.2d, v0.2d, v2.2d
- zip2 v0.2d, v0.2d, v2.2d
- zip1 v2.2d, v1.2d, v3.2d
- zip2 v1.2d, v1.2d, v3.2d
- zip1 v3.2d, v4.2d, v6.2d
- zip2 v4.2d, v4.2d, v6.2d
- zip1 v6.2d, v5.2d, v7.2d
- zip2 v5.2d, v5.2d, v7.2d
- add x24, x24, #32
- csel x20, x13, x20, ne
- cmp x28, #3
- stp q16, q3, [x26]
- stp q0, q4, [x26, #32]
- stp q2, q6, [x26, #64]
- stp q1, q5, [x26, #96]
- add x26, x26, #128
- b.hi .LBB2_2
-.LBB2_8:
- cbz x28, .LBB2_16
+ ushr v27.4s, v23.4s, #7
+ shl v23.4s, v23.4s, #25
+ orr v0.16b, v0.16b, v24.16b
+ orr v1.16b, v1.16b, v25.16b
+ orr v2.16b, v2.16b, v26.16b
+ orr v23.16b, v23.16b, v27.16b
+ movi v24.4s, #64
+ eor v12.16b, v4.16b, v20.16b
+ eor v31.16b, v18.16b, v3.16b
+ eor v29.16b, v17.16b, v22.16b
+ eor v30.16b, v16.16b, v19.16b
+ eor v28.16b, v7.16b, v23.16b
+ eor v23.16b, v6.16b, v0.16b
+ eor v13.16b, v1.16b, v5.16b
+ eor v25.16b, v2.16b, v21.16b
+ cbnz x15, .LBB3_5
+ b .LBB3_2
+.LBB3_6:
+ cbz x24, .LBB3_14
orr w8, w7, w19
- and x21, x5, #0x1
- stur w8, [x29, #-64]
-.LBB2_10:
+ and x22, x5, #0x1
+ stur w8, [x29, #-192]
+.LBB3_8:
ldr x8, [sp, #40]
- ldr x25, [x24]
- ldur w4, [x29, #-64]
- ldp q1, q0, [x8]
- mov x8, x22
- stp q1, q0, [x29, #-48]
-.LBB2_11:
- subs x23, x8, #1
- b.eq .LBB2_13
- cbnz x8, .LBB2_14
- b .LBB2_15
-.LBB2_13:
- orr w4, w4, w27
-.LBB2_14:
- sub x0, x29, #48
- mov w2, #64
- mov x1, x25
- mov x3, x20
- bl zfs_blake3_compress_in_place_sse2
+ mov x28, x0
+ ldr x25, [x0]
+ mov x23, x2
+ ldur w5, [x29, #-192]
+ ldp q0, q1, [x8]
+ mov x8, x2
+ b .LBB3_11
+.LBB3_9:
+ orr w5, w5, w27
+.LBB3_10:
+ sub x0, x29, #144
+ sub x1, x29, #176
+ mov x2, x25
+ mov w3, #64
+ mov x4, x20
+ bl compress_pre
+ ldp q0, q1, [x29, #-144]
add x25, x25, #64
- mov x8, x23
- mov w4, w19
- b .LBB2_11
-.LBB2_15:
- ldp q0, q1, [x29, #-48]
- add x20, x20, x21
- add x24, x24, #8
- subs x28, x28, #1
- stp q0, q1, [x26], #32
- b.ne .LBB2_10
-.LBB2_16:
- add sp, sp, #384
+ mov x8, x21
+ mov w5, w19
+ ldp q2, q3, [x29, #-112]
+ eor v0.16b, v2.16b, v0.16b
+ eor v1.16b, v3.16b, v1.16b
+.LBB3_11:
+ subs x21, x8, #1
+ stp q0, q1, [x29, #-176]
+ b.eq .LBB3_9
+ cbnz x8, .LBB3_10
+ ldp q1, q0, [x29, #-176]
+ mov x0, x28
+ add x20, x20, x22
+ add x0, x28, #8
+ subs x24, x24, #1
+ mov x2, x23
+ stp q1, q0, [x26], #32
+ b.ne .LBB3_8
+.LBB3_14:
+ add sp, sp, #464
ldp x20, x19, [sp, #144]
ldp x22, x21, [sp, #128]
ldp x24, x23, [sp, #112]
@@ -2442,9 +2052,10 @@ zfs_blake3_hash_many_sse2:
ldp d11, d10, [sp, #32]
ldp d13, d12, [sp, #16]
ldp d15, d14, [sp], #160
+ hint #29
ret
-.Lfunc_end2:
- .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2
+.Lfunc_end3:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end3-zfs_blake3_hash_many_sse2
.cfi_endproc
.section ".note.GNU-stack","",@progbits
-#endif
+#endif
\ No newline at end of file
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
index a05baec96..c4c2dfc5b 100644
--- a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@@ -22,518 +22,61 @@
/*
* Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
* Copyright (c) 2019-2022 Samuel Neves
- * Copyright (c) 2022 Tino Reichardt <[email protected]>
+ * Copyright (c) 2022-2023 Tino Reichardt <[email protected]>
*
* This is converted assembly: SSE4.1 -> ARMv8-A
* Used tools: SIMDe https://github.com/simd-everywhere/simde
+ *
+ * Should work on FreeBSD, Linux and macOS
+ * see: https://github.com/mcmilk/BLAKE3-tests/blob/master/contrib/simde.sh
*/
#if defined(__aarch64__)
.text
- .section .rodata.cst16,"aM",@progbits,16
- .p2align 4
-.LCPI0_0:
- .byte 2
- .byte 3
- .byte 0
- .byte 1
- .byte 6
- .byte 7
- .byte 4
- .byte 5
- .byte 10
- .byte 11
- .byte 8
- .byte 9
- .byte 14
- .byte 15
- .byte 12
- .byte 13
-.LCPI0_1:
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
-.LCPI0_2:
- .byte 1
- .byte 2
- .byte 3
- .byte 0
- .byte 5
- .byte 6
- .byte 7
- .byte 4
- .byte 9
- .byte 10
- .byte 11
- .byte 8
- .byte 13
- .byte 14
- .byte 15
- .byte 12
-.LCPI0_3:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 20
- .byte 21
- .byte 22
- .byte 23
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
-.LCPI0_4:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 4
- .byte 5
- .byte 6
- .byte 7
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
+ .section .note.gnu.property,"a",@note /* GNU property note record; field names below follow the ELF note / GNU property layout - NOTE(review): confirm against the AArch64 ELF ABI */
+ .p2align 3 /* align the note to 2^3 = 8 bytes */
+ .word 4 /* namesz: 4 bytes, matches "GNU" plus its NUL terminator emitted by .asciz below */
+ .word 16 /* descsz: 16 bytes, matches the four .word entries that follow the name */
+ .word 5 /* note type; presumably NT_GNU_PROPERTY_TYPE_0 */
+ .asciz "GNU" /* note owner name, NUL-terminated */
+ .word 3221225472 /* property type 0xC0000000; presumably GNU_PROPERTY_AARCH64_FEATURE_1_AND */
+ .word 4 /* property data size: 4 bytes (the next word) */
+ .word 3 /* property data: bits 0 and 1 set; presumably the BTI and PAC feature flags - TODO confirm */
+ .word 0 /* zero word; pads the 12-byte property out to the 16 bytes declared in descsz */
+.Lsec_end0: /* local label marking the end of the note data */
.text
.globl zfs_blake3_compress_in_place_sse41
.p2align 2
.type zfs_blake3_compress_in_place_sse41,@function
zfs_blake3_compress_in_place_sse41:
.cfi_startproc
- ldp q7, q6, [x0]
- ldp q17, q18, [x1]
- add x12, x1, #32
- ld2 { v4.4s, v5.4s }, [x12]
- lsr x10, x3, #32
- fmov s16, w3
- adrp x13, .LCPI0_0
- adrp x11, .LCPI0_1
- and w8, w2, #0xff
- mov v16.s[1], w10
- ldr q0, [x13, :lo12:.LCPI0_0]
- ldr q20, [x11, :lo12:.LCPI0_1]
- adrp x11, .LCPI0_4
- and w9, w4, #0xff
- ldr q2, [x11, :lo12:.LCPI0_4]
- mov v16.s[2], w8
- uzp1 v21.4s, v17.4s, v18.4s
- add v7.4s, v6.4s, v7.4s
- adrp x12, .LCPI0_3
- mov v16.s[3], w9
- uzp2 v18.4s, v17.4s, v18.4s
- add v7.4s, v7.4s, v21.4s
- ext v17.16b, v5.16b, v5.16b, #12
- ldr q3, [x12, :lo12:.LCPI0_3]
- ext v24.16b, v4.16b, v4.16b, #12
- eor v16.16b, v7.16b, v16.16b
- mov v27.16b, v17.16b
- uzp1 v19.4s, v21.4s, v21.4s
- ext v25.16b, v21.16b, v21.16b, #12
- zip2 v28.4s, v18.4s, v17.4s
- tbl v29.16b, { v16.16b }, v0.16b
- mov v27.s[1], v24.s[2]
- zip1 v23.2d, v17.2d, v18.2d
- ext v19.16b, v19.16b, v21.16b, #8
- add v22.4s, v29.4s, v20.4s
- ext v26.16b, v21.16b, v25.16b, #12
- tbl v20.16b, { v23.16b, v24.16b }, v2.16b
- zip1 v21.4s, v28.4s, v24.4s
- zip1 v23.4s, v24.4s, v28.4s
- uzp2 v19.4s, v19.4s, v18.4s
- eor v24.16b, v22.16b, v6.16b
- ext v25.16b, v20.16b, v20.16b, #12
- ext v6.16b, v23.16b, v21.16b, #8
- add v7.4s, v7.4s, v18.4s
- ext v18.16b, v19.16b, v19.16b, #4
- tbl v16.16b, { v26.16b, v27.16b }, v3.16b
- uzp1 v21.4s, v20.4s, v25.4s
- mov v26.16b, v6.16b
- ext v23.16b, v18.16b, v18.16b, #12
- mov v26.s[1], v21.s[2]
- adrp x10, .LCPI0_2
- ext v25.16b, v18.16b, v23.16b, #12
- uzp1 v23.4s, v18.4s, v18.4s
- ldr q1, [x10, :lo12:.LCPI0_2]
- ext v18.16b, v23.16b, v18.16b, #8
- ushr v23.4s, v24.4s, #12
- shl v24.4s, v24.4s, #20
- orr v23.16b, v24.16b, v23.16b
- add v7.4s, v7.4s, v23.4s
- eor v27.16b, v29.16b, v7.16b
- add v4.4s, v7.4s, v4.4s
- tbl v7.16b, { v25.16b, v26.16b }, v3.16b
- tbl v26.16b, { v27.16b }, v1.16b
- add v22.4s, v22.4s, v26.4s
- uzp2 v18.4s, v18.4s, v16.4s
- eor v23.16b, v23.16b, v22.16b
- ext v5.16b, v18.16b, v18.16b, #4
- ushr v27.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- uzp1 v25.4s, v5.4s, v5.4s
- orr v23.16b, v23.16b, v27.16b
- ext v28.16b, v4.16b, v4.16b, #12
- ext v4.16b, v25.16b, v5.16b, #8
- ext v25.16b, v26.16b, v26.16b, #8
- add v26.4s, v28.4s, v23.4s
- eor v25.16b, v26.16b, v25.16b
- ext v22.16b, v22.16b, v22.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v22.4s, v22.4s, v25.4s
- eor v23.16b, v23.16b, v22.16b
- add v17.4s, v26.4s, v17.4s
- ushr v26.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- orr v23.16b, v23.16b, v26.16b
- add v17.4s, v17.4s, v23.4s
- eor v25.16b, v25.16b, v17.16b
- add v17.4s, v17.4s, v19.4s
- tbl v19.16b, { v25.16b }, v1.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- ext v17.16b, v17.16b, v17.16b, #4
- orr v23.16b, v23.16b, v25.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v17.4s, v17.4s, v23.4s
- eor v19.16b, v17.16b, v19.16b
- ext v22.16b, v22.16b, v22.16b, #12
- tbl v19.16b, { v19.16b }, v0.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- add v17.4s, v17.4s, v16.4s
- orr v23.16b, v23.16b, v25.16b
- add v17.4s, v17.4s, v23.4s
- ext v25.16b, v17.16b, v17.16b, #12
- eor v17.16b, v19.16b, v17.16b
- tbl v17.16b, { v17.16b }, v1.16b
- add v19.4s, v22.4s, v17.4s
- eor v22.16b, v23.16b, v19.16b
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v6.2d, v16.2d
- ushr v23.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- zip2 v24.4s, v16.4s, v6.4s
- tbl v26.16b, { v20.16b, v21.16b }, v2.16b
- orr v22.16b, v22.16b, v23.16b
- zip1 v16.4s, v24.4s, v21.4s
- zip1 v20.4s, v21.4s, v24.4s
- ext v21.16b, v26.16b, v26.16b, #12
- ext v17.16b, v17.16b, v17.16b, #8
- add v25.4s, v25.4s, v22.4s
- ext v16.16b, v20.16b, v16.16b, #8
- uzp1 v21.4s, v26.4s, v21.4s
- eor v26.16b, v25.16b, v17.16b
- ext v19.16b, v19.16b, v19.16b, #4
- tbl v26.16b, { v26.16b }, v0.16b
- mov v29.16b, v16.16b
- add v19.4s, v19.4s, v26.4s
- ext v27.16b, v5.16b, v5.16b, #12
- mov v29.s[1], v21.s[2]
- eor v22.16b, v22.16b, v19.16b
- ext v28.16b, v5.16b, v27.16b, #12
- ushr v27.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v25.4s, v6.4s
- orr v22.16b, v22.16b, v27.16b
- add v6.4s, v6.4s, v22.4s
- eor v26.16b, v26.16b, v6.16b
- add v6.4s, v6.4s, v18.4s
- tbl v18.16b, { v26.16b }, v1.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v22.16b, v22.16b, v26.16b
- ext v18.16b, v18.16b, v18.16b, #8
- add v6.4s, v6.4s, v22.4s
- eor v18.16b, v6.16b, v18.16b
- ext v19.16b, v19.16b, v19.16b, #12
- tbl v18.16b, { v18.16b }, v0.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v6.4s, v7.4s
- orr v22.16b, v22.16b, v26.16b
- add v6.4s, v6.4s, v22.4s
- ext v26.16b, v6.16b, v6.16b, #12
- eor v6.16b, v18.16b, v6.16b
- uzp2 v4.4s, v4.4s, v7.4s
- zip2 v25.4s, v7.4s, v16.4s
- add v26.4s, v26.4s, v21.4s
- zip1 v20.2d, v16.2d, v7.2d
- tbl v6.16b, { v6.16b }, v1.16b
- ext v24.16b, v4.16b, v4.16b, #4
- tbl v27.16b, { v20.16b, v21.16b }, v2.16b
- zip1 v7.4s, v25.4s, v21.4s
- zip1 v20.4s, v21.4s, v25.4s
- add v18.4s, v19.4s, v6.4s
- uzp1 v5.4s, v24.4s, v24.4s
- ext v21.16b, v27.16b, v27.16b, #12
- ext v7.16b, v20.16b, v7.16b, #8
- eor v19.16b, v22.16b, v18.16b
- ext v5.16b, v5.16b, v24.16b, #8
- tbl v17.16b, { v28.16b, v29.16b }, v3.16b
- uzp1 v21.4s, v27.4s, v21.4s
- mov v28.16b, v7.16b
- ushr v22.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v23.16b, v24.16b, v24.16b, #12
- uzp2 v5.4s, v5.4s, v17.4s
- mov v28.s[1], v21.s[2]
- orr v19.16b, v19.16b, v22.16b
- ext v27.16b, v24.16b, v23.16b, #12
- ext v23.16b, v5.16b, v5.16b, #4
- ext v6.16b, v6.16b, v6.16b, #8
- ext v25.16b, v18.16b, v18.16b, #4
- add v18.4s, v26.4s, v19.4s
- uzp1 v24.4s, v23.4s, v23.4s
- eor v6.16b, v18.16b, v6.16b
- ext v24.16b, v24.16b, v23.16b, #8
- add v16.4s, v18.4s, v16.4s
- tbl v18.16b, { v27.16b, v28.16b }, v3.16b
- tbl v27.16b, { v6.16b }, v0.16b
- uzp2 v6.4s, v24.4s, v18.4s
- add v24.4s, v25.4s, v27.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v25.16b
- add v16.4s, v16.4s, v19.4s
- eor v25.16b, v27.16b, v16.16b
- add v4.4s, v16.4s, v4.4s
- tbl v16.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v16.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v19.16b, v19.16b, v25.16b
- ext v16.16b, v16.16b, v16.16b, #8
- add v4.4s, v4.4s, v19.4s
- eor v16.16b, v4.16b, v16.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v25.16b, { v16.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v16.16b, v19.16b, v24.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v17.4s
- orr v19.16b, v16.16b, v19.16b
- add v27.4s, v4.4s, v19.4s
- eor v25.16b, v25.16b, v27.16b
- tbl v25.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v25.4s
- zip2 v26.4s, v17.4s, v7.4s
- ext v4.16b, v27.16b, v27.16b, #12
- eor v19.16b, v19.16b, v24.16b
- add v28.4s, v4.4s, v21.4s
- zip1 v20.2d, v7.2d, v17.2d
- zip1 v4.4s, v26.4s, v21.4s
- zip1 v17.4s, v21.4s, v26.4s
- ushr v26.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v26.16b
- ext v25.16b, v25.16b, v25.16b, #8
- add v27.4s, v28.4s, v19.4s
- eor v25.16b, v27.16b, v25.16b
- ext v24.16b, v24.16b, v24.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v19.16b, v19.16b, v24.16b
- add v7.4s, v27.4s, v7.4s
- ushr v27.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v27.16b
- add v7.4s, v7.4s, v19.4s
- eor v25.16b, v25.16b, v7.16b
- add v5.4s, v7.4s, v5.4s
- tbl v7.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v19.16b, v19.16b, v25.16b
- ext v7.16b, v7.16b, v7.16b, #8
- add v5.4s, v5.4s, v19.4s
- eor v7.16b, v5.16b, v7.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v7.16b, { v7.16b }, v0.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- tbl v16.16b, { v20.16b, v21.16b }, v2.16b
- add v5.4s, v5.4s, v18.4s
- orr v19.16b, v19.16b, v25.16b
- ext v20.16b, v16.16b, v16.16b, #12
- ext v4.16b, v17.16b, v4.16b, #8
- add v5.4s, v5.4s, v19.4s
- uzp1 v21.4s, v16.4s, v20.4s
- mov v17.16b, v4.16b
- ext v25.16b, v5.16b, v5.16b, #12
- mov v17.s[1], v21.s[2]
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v4.2d, v18.2d
- ext v22.16b, v23.16b, v23.16b, #12
- zip2 v26.4s, v18.4s, v4.4s
- tbl v18.16b, { v20.16b, v21.16b }, v2.16b
- eor v5.16b, v7.16b, v5.16b
- ext v16.16b, v23.16b, v22.16b, #12
- ext v22.16b, v6.16b, v6.16b, #4
- zip1 v27.4s, v26.4s, v21.4s
- zip1 v20.4s, v21.4s, v26.4s
- ext v21.16b, v18.16b, v18.16b, #12
- tbl v5.16b, { v5.16b }, v1.16b
- ext v20.16b, v20.16b, v27.16b, #8
- uzp1 v27.4s, v18.4s, v21.4s
- uzp1 v18.4s, v22.4s, v22.4s
- add v21.4s, v24.4s, v5.4s
- ext v18.16b, v18.16b, v22.16b, #8
- eor v19.16b, v19.16b, v21.16b
- tbl v7.16b, { v16.16b, v17.16b }, v3.16b
- uzp2 v18.4s, v18.4s, v17.4s
- zip2 v16.4s, v16.4s, v20.4s
- ushr v17.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v17.16b, v19.16b, v17.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v19.4s, v25.4s, v17.4s
- eor v5.16b, v19.16b, v5.16b
- ext v21.16b, v21.16b, v21.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
- add v4.4s, v19.4s, v4.4s
- add v19.4s, v21.4s, v5.4s
- eor v17.16b, v17.16b, v19.16b
- ushr v21.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
- orr v17.16b, v17.16b, v21.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- add v4.4s, v4.4s, v6.4s
- add v6.4s, v19.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v17.16b, v17.16b, v19.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #12
- tbl v5.16b, { v5.16b }, v0.16b
- add v6.4s, v6.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
- add v4.4s, v4.4s, v7.4s
- orr v17.16b, v17.16b, v19.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- mov v29.16b, v20.16b
- ext v4.16b, v4.16b, v4.16b, #12
- add v6.4s, v6.4s, v5.4s
- mov v29.s[1], v27.s[2]
- add v4.4s, v4.4s, v27.4s
- zip1 v26.2d, v20.2d, v7.2d
- zip1 v7.4s, v16.4s, v27.4s
- zip1 v16.4s, v27.4s, v16.4s
- eor v17.16b, v17.16b, v6.16b
- ext v7.16b, v16.16b, v7.16b, #8
- ushr v16.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- orr v16.16b, v17.16b, v16.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v16.4s
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
- add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
- ushr v17.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v20.4s
- orr v16.16b, v16.16b, v17.16b
- add v4.4s, v4.4s, v16.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
- add v4.4s, v4.4s, v18.4s
- ushr v17.4s, v16.4s, #7
- shl v16.4s, v16.4s, #25
- ext v23.16b, v22.16b, v22.16b, #12
- ext v4.16b, v4.16b, v4.16b, #4
- orr v16.16b, v16.16b, v17.16b
- ext v28.16b, v22.16b, v23.16b, #12
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v16.4s, v4.4s
- tbl v3.16b, { v28.16b, v29.16b }, v3.16b
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v4.4s, v3.4s
- tbl v4.16b, { v5.16b }, v0.16b
- add v5.4s, v6.4s, v4.4s
- eor v6.16b, v16.16b, v5.16b
- ushr v16.4s, v6.4s, #12
- shl v6.4s, v6.4s, #20
- orr v6.16b, v6.16b, v16.16b
- tbl v2.16b, { v26.16b, v27.16b }, v2.16b
- add v3.4s, v3.4s, v6.4s
- ext v19.16b, v2.16b, v2.16b, #12
- eor v4.16b, v4.16b, v3.16b
- uzp1 v2.4s, v2.4s, v19.4s
- ext v3.16b, v3.16b, v3.16b, #12
- tbl v4.16b, { v4.16b }, v1.16b
- add v2.4s, v3.4s, v2.4s
- add v3.4s, v5.4s, v4.4s
- eor v5.16b, v6.16b, v3.16b
- ushr v6.4s, v5.4s, #7
- shl v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v6.16b
- ext v4.16b, v4.16b, v4.16b, #8
- add v2.4s, v2.4s, v5.4s
- eor v4.16b, v2.16b, v4.16b
- ext v3.16b, v3.16b, v3.16b, #4
- tbl v0.16b, { v4.16b }, v0.16b
- add v3.4s, v3.4s, v0.4s
- eor v4.16b, v5.16b, v3.16b
- ushr v5.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- add v2.4s, v2.4s, v7.4s
- orr v4.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v4.4s
- eor v0.16b, v0.16b, v2.16b
- tbl v0.16b, { v0.16b }, v1.16b
- add v1.4s, v3.4s, v0.4s
- eor v3.16b, v4.16b, v1.16b
- ext v2.16b, v2.16b, v2.16b, #4
- ext v1.16b, v1.16b, v1.16b, #12
- ushr v4.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
- ext v0.16b, v0.16b, v0.16b, #8
- eor v1.16b, v2.16b, v1.16b
- orr v2.16b, v3.16b, v4.16b
+ hint #25
+ .cfi_negate_ra_state
+ sub sp, sp, #96
+ stp x29, x30, [sp, #64]
+ add x29, sp, #64
+ str x19, [sp, #80]
+ .cfi_def_cfa w29, 32
+ .cfi_offset w19, -16
+ .cfi_offset w30, -24
+ .cfi_offset w29, -32
+ mov x19, x0
+ mov w5, w4
+ mov x4, x3
+ mov w3, w2
+ mov x2, x1
+ mov x0, sp
+ mov x1, x19
+ bl compress_pre
+ ldp q0, q1, [sp]
+ ldp q2, q3, [sp, #32]
eor v0.16b, v2.16b, v0.16b
- stp q1, q0, [x0]
+ eor v1.16b, v3.16b, v1.16b
+ ldp x29, x30, [sp, #64]
+ stp q0, q1, [x19]
+ ldr x19, [sp, #80]
+ add sp, sp, #96
+ hint #29
ret
.Lfunc_end0:
.size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41
@@ -542,6 +85,9 @@ zfs_blake3_compress_in_place_sse41:
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
.LCPI1_0:
+ .xword -4942790177982912921
+ .xword -6534734903820487822
+.LCPI1_1:
.byte 2
.byte 3
.byte 0
@@ -558,11 +104,6 @@ zfs_blake3_compress_in_place_sse41:
.byte 15
.byte 12
.byte 13
-.LCPI1_1:
- .word 1779033703
- .word 3144134277
- .word 1013904242
- .word 2773480762
.LCPI1_2:
.byte 1
.byte 2
@@ -580,488 +121,497 @@ zfs_blake3_compress_in_place_sse41:
.byte 14
.byte 15
.byte 12
-.LCPI1_3:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 20
- .byte 21
- .byte 22
- .byte 23
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
-.LCPI1_4:
- .byte 0
- .byte 1
- .byte 2
- .byte 3
- .byte 4
- .byte 5
- .byte 6
- .byte 7
- .byte 8
- .byte 9
- .byte 10
- .byte 11
- .byte 28
- .byte 29
- .byte 30
- .byte 31
.text
- .globl zfs_blake3_compress_xof_sse41
.p2align 2
- .type zfs_blake3_compress_xof_sse41,@function
-zfs_blake3_compress_xof_sse41:
+ .type compress_pre,@function
+compress_pre:
.cfi_startproc
- ldp q7, q6, [x0]
- ldp q17, q18, [x1]
- add x12, x1, #32
- ld2 { v4.4s, v5.4s }, [x12]
- lsr x10, x3, #32
- fmov s16, w3
- adrp x13, .LCPI1_0
- adrp x11, .LCPI1_1
- and w8, w2, #0xff
- mov v16.s[1], w10
- ldr q0, [x13, :lo12:.LCPI1_0]
- ldr q20, [x11, :lo12:.LCPI1_1]
- adrp x11, .LCPI1_4
- and w9, w4, #0xff
- ldr q2, [x11, :lo12:.LCPI1_4]
- mov v16.s[2], w8
- uzp1 v21.4s, v17.4s, v18.4s
- add v7.4s, v6.4s, v7.4s
- adrp x12, .LCPI1_3
- mov v16.s[3], w9
- uzp2 v18.4s, v17.4s, v18.4s
- add v7.4s, v7.4s, v21.4s
- ext v17.16b, v5.16b, v5.16b, #12
- ldr q3, [x12, :lo12:.LCPI1_3]
- ext v24.16b, v4.16b, v4.16b, #12
- eor v16.16b, v7.16b, v16.16b
- mov v27.16b, v17.16b
- uzp1 v19.4s, v21.4s, v21.4s
- ext v25.16b, v21.16b, v21.16b, #12
- zip2 v28.4s, v18.4s, v17.4s
- tbl v29.16b, { v16.16b }, v0.16b
- mov v27.s[1], v24.s[2]
- zip1 v23.2d, v17.2d, v18.2d
- ext v19.16b, v19.16b, v21.16b, #8
- add v22.4s, v29.4s, v20.4s
- ext v26.16b, v21.16b, v25.16b, #12
- tbl v20.16b, { v23.16b, v24.16b }, v2.16b
- zip1 v21.4s, v28.4s, v24.4s
- zip1 v23.4s, v24.4s, v28.4s
- uzp2 v19.4s, v19.4s, v18.4s
- eor v24.16b, v22.16b, v6.16b
- ext v25.16b, v20.16b, v20.16b, #12
- ext v6.16b, v23.16b, v21.16b, #8
- add v7.4s, v7.4s, v18.4s
- ext v18.16b, v19.16b, v19.16b, #4
- tbl v16.16b, { v26.16b, v27.16b }, v3.16b
- uzp1 v21.4s, v20.4s, v25.4s
- mov v26.16b, v6.16b
- ext v23.16b, v18.16b, v18.16b, #12
- mov v26.s[1], v21.s[2]
- adrp x10, .LCPI1_2
- ext v25.16b, v18.16b, v23.16b, #12
- uzp1 v23.4s, v18.4s, v18.4s
- ldr q1, [x10, :lo12:.LCPI1_2]
- ext v18.16b, v23.16b, v18.16b, #8
- ushr v23.4s, v24.4s, #12
- shl v24.4s, v24.4s, #20
- orr v23.16b, v24.16b, v23.16b
- add v7.4s, v7.4s, v23.4s
- eor v27.16b, v29.16b, v7.16b
- add v4.4s, v7.4s, v4.4s
- tbl v7.16b, { v25.16b, v26.16b }, v3.16b
- tbl v26.16b, { v27.16b }, v1.16b
- add v22.4s, v22.4s, v26.4s
- uzp2 v18.4s, v18.4s, v16.4s
- eor v23.16b, v23.16b, v22.16b
- ext v5.16b, v18.16b, v18.16b, #4
- ushr v27.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- uzp1 v25.4s, v5.4s, v5.4s
- orr v23.16b, v23.16b, v27.16b
- ext v28.16b, v4.16b, v4.16b, #12
- ext v4.16b, v25.16b, v5.16b, #8
- ext v25.16b, v26.16b, v26.16b, #8
- add v26.4s, v28.4s, v23.4s
- eor v25.16b, v26.16b, v25.16b
- ext v22.16b, v22.16b, v22.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v22.4s, v22.4s, v25.4s
- eor v23.16b, v23.16b, v22.16b
- add v17.4s, v26.4s, v17.4s
- ushr v26.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- orr v23.16b, v23.16b, v26.16b
- add v17.4s, v17.4s, v23.4s
- eor v25.16b, v25.16b, v17.16b
- add v17.4s, v17.4s, v19.4s
- tbl v19.16b, { v25.16b }, v1.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #7
- shl v23.4s, v23.4s, #25
- ext v17.16b, v17.16b, v17.16b, #4
- orr v23.16b, v23.16b, v25.16b
- ext v19.16b, v19.16b, v19.16b, #8
- add v17.4s, v17.4s, v23.4s
- eor v19.16b, v17.16b, v19.16b
- ext v22.16b, v22.16b, v22.16b, #12
- tbl v19.16b, { v19.16b }, v0.16b
- add v22.4s, v22.4s, v19.4s
- eor v23.16b, v23.16b, v22.16b
- ushr v25.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- add v17.4s, v17.4s, v16.4s
- orr v23.16b, v23.16b, v25.16b
- add v17.4s, v17.4s, v23.4s
- ext v25.16b, v17.16b, v17.16b, #12
- eor v17.16b, v19.16b, v17.16b
- tbl v17.16b, { v17.16b }, v1.16b
- add v19.4s, v22.4s, v17.4s
- eor v22.16b, v23.16b, v19.16b
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v6.2d, v16.2d
- ushr v23.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- zip2 v24.4s, v16.4s, v6.4s
- tbl v26.16b, { v20.16b, v21.16b }, v2.16b
- orr v22.16b, v22.16b, v23.16b
- zip1 v16.4s, v24.4s, v21.4s
- zip1 v20.4s, v21.4s, v24.4s
- ext v21.16b, v26.16b, v26.16b, #12
- ext v17.16b, v17.16b, v17.16b, #8
- add v25.4s, v25.4s, v22.4s
- ext v16.16b, v20.16b, v16.16b, #8
- uzp1 v21.4s, v26.4s, v21.4s
- eor v26.16b, v25.16b, v17.16b
- ext v19.16b, v19.16b, v19.16b, #4
- tbl v26.16b, { v26.16b }, v0.16b
- mov v29.16b, v16.16b
- add v19.4s, v19.4s, v26.4s
- ext v27.16b, v5.16b, v5.16b, #12
- mov v29.s[1], v21.s[2]
- eor v22.16b, v22.16b, v19.16b
- ext v28.16b, v5.16b, v27.16b, #12
- ushr v27.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v25.4s, v6.4s
- orr v22.16b, v22.16b, v27.16b
- add v6.4s, v6.4s, v22.4s
- eor v26.16b, v26.16b, v6.16b
- add v6.4s, v6.4s, v18.4s
- tbl v18.16b, { v26.16b }, v1.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #7
- shl v22.4s, v22.4s, #25
- ext v6.16b, v6.16b, v6.16b, #4
- orr v22.16b, v22.16b, v26.16b
- ext v18.16b, v18.16b, v18.16b, #8
- add v6.4s, v6.4s, v22.4s
- eor v18.16b, v6.16b, v18.16b
- ext v19.16b, v19.16b, v19.16b, #12
- tbl v18.16b, { v18.16b }, v0.16b
- add v19.4s, v19.4s, v18.4s
- eor v22.16b, v22.16b, v19.16b
- ushr v26.4s, v22.4s, #12
- shl v22.4s, v22.4s, #20
- add v6.4s, v6.4s, v7.4s
- orr v22.16b, v22.16b, v26.16b
- add v6.4s, v6.4s, v22.4s
- ext v26.16b, v6.16b, v6.16b, #12
- eor v6.16b, v18.16b, v6.16b
- uzp2 v4.4s, v4.4s, v7.4s
- zip2 v25.4s, v7.4s, v16.4s
- add v26.4s, v26.4s, v21.4s
- zip1 v20.2d, v16.2d, v7.2d
- tbl v6.16b, { v6.16b }, v1.16b
- ext v24.16b, v4.16b, v4.16b, #4
- tbl v27.16b, { v20.16b, v21.16b }, v2.16b
- zip1 v7.4s, v25.4s, v21.4s
- zip1 v20.4s, v21.4s, v25.4s
- add v18.4s, v19.4s, v6.4s
- uzp1 v5.4s, v24.4s, v24.4s
- ext v21.16b, v27.16b, v27.16b, #12
- ext v7.16b, v20.16b, v7.16b, #8
- eor v19.16b, v22.16b, v18.16b
- ext v5.16b, v5.16b, v24.16b, #8
- tbl v17.16b, { v28.16b, v29.16b }, v3.16b
- uzp1 v21.4s, v27.4s, v21.4s
- mov v28.16b, v7.16b
- ushr v22.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v23.16b, v24.16b, v24.16b, #12
- uzp2 v5.4s, v5.4s, v17.4s
- mov v28.s[1], v21.s[2]
- orr v19.16b, v19.16b, v22.16b
- ext v27.16b, v24.16b, v23.16b, #12
- ext v23.16b, v5.16b, v5.16b, #4
- ext v6.16b, v6.16b, v6.16b, #8
- ext v25.16b, v18.16b, v18.16b, #4
- add v18.4s, v26.4s, v19.4s
- uzp1 v24.4s, v23.4s, v23.4s
- eor v6.16b, v18.16b, v6.16b
- ext v24.16b, v24.16b, v23.16b, #8
- add v16.4s, v18.4s, v16.4s
- tbl v18.16b, { v27.16b, v28.16b }, v3.16b
- tbl v27.16b, { v6.16b }, v0.16b
- uzp2 v6.4s, v24.4s, v18.4s
- add v24.4s, v25.4s, v27.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v25.16b
- add v16.4s, v16.4s, v19.4s
- eor v25.16b, v27.16b, v16.16b
- add v4.4s, v16.4s, v4.4s
- tbl v16.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v16.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v19.16b, v19.16b, v25.16b
- ext v16.16b, v16.16b, v16.16b, #8
- add v4.4s, v4.4s, v19.4s
- eor v16.16b, v4.16b, v16.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v25.16b, { v16.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v16.16b, v19.16b, v24.16b
- ushr v19.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v17.4s
- orr v19.16b, v16.16b, v19.16b
- add v27.4s, v4.4s, v19.4s
- eor v25.16b, v25.16b, v27.16b
- tbl v25.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v25.4s
- zip2 v26.4s, v17.4s, v7.4s
- ext v4.16b, v27.16b, v27.16b, #12
- eor v19.16b, v19.16b, v24.16b
- add v28.4s, v4.4s, v21.4s
- zip1 v20.2d, v7.2d, v17.2d
- zip1 v4.4s, v26.4s, v21.4s
- zip1 v17.4s, v21.4s, v26.4s
- ushr v26.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v19.16b, v19.16b, v26.16b
- ext v25.16b, v25.16b, v25.16b, #8
- add v27.4s, v28.4s, v19.4s
- eor v25.16b, v27.16b, v25.16b
- ext v24.16b, v24.16b, v24.16b, #4
- tbl v25.16b, { v25.16b }, v0.16b
- add v24.4s, v24.4s, v25.4s
- eor v19.16b, v19.16b, v24.16b
- add v7.4s, v27.4s, v7.4s
- ushr v27.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v19.16b, v19.16b, v27.16b
- add v7.4s, v7.4s, v19.4s
- eor v25.16b, v25.16b, v7.16b
- add v5.4s, v7.4s, v5.4s
- tbl v7.16b, { v25.16b }, v1.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- ext v5.16b, v5.16b, v5.16b, #4
- orr v19.16b, v19.16b, v25.16b
+ hint #34
+ fmov s1, w3
+ movi d0, #0x0000ff000000ff
+ ldr q2, [x1]
+ adrp x8, .LCPI1_0
+ mov v1.s[1], w5
+ str q2, [x0]
+ ldr q4, [x8, :lo12:.LCPI1_0]
+ ldr q5, [x1, #16]
+ adrp x8, .LCPI1_1
+ and v0.8b, v1.8b, v0.8b
+ fmov d1, x4
+ stp q5, q4, [x0, #16]
+ mov v1.d[1], v0.d[0]
+ str q1, [x0, #48]
+ ldp q6, q7, [x2]
+ uzp1 v3.4s, v6.4s, v7.4s
+ add v0.4s, v2.4s, v3.4s
+ uzp2 v2.4s, v6.4s, v7.4s
+ add v16.4s, v0.4s, v5.4s
+ ldr q0, [x8, :lo12:.LCPI1_1]
+ adrp x8, .LCPI1_2
+ eor v1.16b, v16.16b, v1.16b
+ add v7.4s, v16.4s, v2.4s
+ tbl v1.16b, { v1.16b }, v0.16b
+ add v4.4s, v1.4s, v4.4s
+ eor v5.16b, v4.16b, v5.16b
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v6.16b
+ add v6.4s, v7.4s, v5.4s
+ eor v7.16b, v1.16b, v6.16b
+ ldr q1, [x8, :lo12:.LCPI1_2]
+ add x8, x2, #32
+ tbl v7.16b, { v7.16b }, v1.16b
+ ld2 { v16.4s, v17.4s }, [x8]
+ add v4.4s, v4.4s, v7.4s
ext v7.16b, v7.16b, v7.16b, #8
- add v5.4s, v5.4s, v19.4s
- eor v7.16b, v5.16b, v7.16b
- ext v24.16b, v24.16b, v24.16b, #12
- tbl v7.16b, { v7.16b }, v0.16b
- add v24.4s, v24.4s, v7.4s
- eor v19.16b, v19.16b, v24.16b
- ushr v25.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- tbl v16.16b, { v20.16b, v21.16b }, v2.16b
- add v5.4s, v5.4s, v18.4s
- orr v19.16b, v19.16b, v25.16b
- ext v20.16b, v16.16b, v16.16b, #12
- ext v4.16b, v17.16b, v4.16b, #8
- add v5.4s, v5.4s, v19.4s
- uzp1 v21.4s, v16.4s, v20.4s
- mov v17.16b, v4.16b
- ext v25.16b, v5.16b, v5.16b, #12
- mov v17.s[1], v21.s[2]
- add v25.4s, v25.4s, v21.4s
- zip1 v20.2d, v4.2d, v18.2d
- ext v22.16b, v23.16b, v23.16b, #12
- zip2 v26.4s, v18.4s, v4.4s
- tbl v18.16b, { v20.16b, v21.16b }, v2.16b
- eor v5.16b, v7.16b, v5.16b
- ext v16.16b, v23.16b, v22.16b, #12
- ext v22.16b, v6.16b, v6.16b, #4
- zip1 v27.4s, v26.4s, v21.4s
- zip1 v20.4s, v21.4s, v26.4s
- ext v21.16b, v18.16b, v18.16b, #12
- tbl v5.16b, { v5.16b }, v1.16b
- ext v20.16b, v20.16b, v27.16b, #8
- uzp1 v27.4s, v18.4s, v21.4s
- uzp1 v18.4s, v22.4s, v22.4s
- add v21.4s, v24.4s, v5.4s
- ext v18.16b, v18.16b, v22.16b, #8
- eor v19.16b, v19.16b, v21.16b
- tbl v7.16b, { v16.16b, v17.16b }, v3.16b
- uzp2 v18.4s, v18.4s, v17.4s
- zip2 v16.4s, v16.4s, v20.4s
- ushr v17.4s, v19.4s, #7
- shl v19.4s, v19.4s, #25
- orr v17.16b, v19.16b, v17.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v19.4s, v25.4s, v17.4s
- eor v5.16b, v19.16b, v5.16b
- ext v21.16b, v21.16b, v21.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
- add v4.4s, v19.4s, v4.4s
- add v19.4s, v21.4s, v5.4s
- eor v17.16b, v17.16b, v19.16b
- ushr v21.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
- orr v17.16b, v17.16b, v21.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- add v4.4s, v4.4s, v6.4s
- add v6.4s, v19.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- ext v4.16b, v4.16b, v4.16b, #4
- orr v17.16b, v17.16b, v19.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v17.4s
+ add v6.4s, v6.4s, v16.4s
eor v5.16b, v4.16b, v5.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #12
ext v6.16b, v6.16b, v6.16b, #12
- tbl v5.16b, { v5.16b }, v0.16b
+ ushr v18.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v18.16b
+ ext v18.16b, v17.16b, v17.16b, #12
add v6.4s, v6.4s, v5.4s
- eor v17.16b, v17.16b, v6.16b
- ushr v19.4s, v17.4s, #12
- shl v17.4s, v17.4s, #20
+ mov v17.16b, v18.16b
+ eor v7.16b, v7.16b, v6.16b
+ add v6.4s, v6.4s, v18.4s
+ mov v17.s[1], v16.s[2]
+ tbl v7.16b, { v7.16b }, v0.16b
add v4.4s, v4.4s, v7.4s
- orr v17.16b, v17.16b, v19.16b
- add v4.4s, v4.4s, v17.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
- mov v29.16b, v20.16b
+ eor v5.16b, v4.16b, v5.16b
+ ushr v19.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v19.16b
+ uzp1 v19.4s, v3.4s, v3.4s
+ add v6.4s, v6.4s, v5.4s
+ ext v19.16b, v19.16b, v3.16b, #8
+ eor v7.16b, v7.16b, v6.16b
+ uzp2 v19.4s, v19.4s, v2.4s
+ tbl v7.16b, { v7.16b }, v1.16b
+ add v6.4s, v6.4s, v19.4s
+ add v4.4s, v4.4s, v7.4s
+ ext v6.16b, v6.16b, v6.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v5.16b, v4.16b, v5.16b
ext v4.16b, v4.16b, v4.16b, #12
+ ushr v20.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v20.16b
+ ext v20.16b, v3.16b, v3.16b, #12
add v6.4s, v6.4s, v5.4s
- mov v29.s[1], v27.s[2]
- add v4.4s, v4.4s, v27.4s
- zip1 v26.2d, v20.2d, v7.2d
- zip1 v7.4s, v16.4s, v27.4s
- zip1 v16.4s, v27.4s, v16.4s
- eor v17.16b, v17.16b, v6.16b
- ext v7.16b, v16.16b, v7.16b, #8
- ushr v16.4s, v17.4s, #7
- shl v17.4s, v17.4s, #25
- orr v16.16b, v17.16b, v16.16b
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v4.4s, v16.4s
+ ext v3.16b, v3.16b, v20.16b, #12
+ eor v7.16b, v7.16b, v6.16b
+ rev64 v3.4s, v3.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ trn2 v3.4s, v3.4s, v17.4s
+ add v4.4s, v4.4s, v7.4s
+ add v6.4s, v6.4s, v3.4s
eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #4
- tbl v5.16b, { v5.16b }, v0.16b
+ ushr v17.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v17.16b
+ zip1 v17.2d, v18.2d, v2.2d
+ zip2 v2.4s, v2.4s, v18.4s
add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
- ushr v17.4s, v16.4s, #12
- shl v16.4s, v16.4s, #20
- add v4.4s, v4.4s, v20.4s
- orr v16.16b, v16.16b, v17.16b
- add v4.4s, v4.4s, v16.4s
- eor v5.16b, v5.16b, v4.16b
- tbl v5.16b, { v5.16b }, v1.16b
+ mov v17.s[3], v16.s[3]
+ zip1 v18.4s, v2.4s, v16.4s
+ zip1 v2.4s, v16.4s, v2.4s
+ eor v7.16b, v7.16b, v6.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ ext v16.16b, v2.16b, v18.16b, #8
+ tbl v7.16b, { v7.16b }, v1.16b
+ add v20.4s, v4.4s, v7.4s
+ ext v4.16b, v17.16b, v17.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v5.16b, v20.16b, v5.16b
+ uzp1 v4.4s, v17.4s, v4.4s
+ ushr v17.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v6.4s, v6.4s, v4.4s
+ orr v5.16b, v5.16b, v17.16b
+ ext v17.16b, v20.16b, v20.16b, #4
add v6.4s, v6.4s, v5.4s
- eor v16.16b, v16.16b, v6.16b
+ eor v7.16b, v7.16b, v6.16b
+ add v6.4s, v6.4s, v16.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v17.4s, v17.4s, v7.4s
+ eor v5.16b, v17.16b, v5.16b
+ ushr v2.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v2.16b, v5.16b, v2.16b
+ add v5.4s, v6.4s, v2.4s
+ ext v6.16b, v19.16b, v19.16b, #4
+ eor v7.16b, v7.16b, v5.16b
+ uzp1 v18.4s, v6.4s, v6.4s
+ tbl v7.16b, { v7.16b }, v1.16b
+ ext v18.16b, v18.16b, v6.16b, #8
+ add v17.4s, v17.4s, v7.4s
+ uzp2 v18.4s, v18.4s, v3.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v2.16b, v17.16b, v2.16b
+ add v5.4s, v5.4s, v18.4s
+ ext v17.16b, v17.16b, v17.16b, #12
+ ushr v19.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v2.16b, v2.16b, v19.16b
+ ext v19.16b, v6.16b, v6.16b, #12
+ add v5.4s, v5.4s, v2.4s
+ ext v6.16b, v6.16b, v19.16b, #12
+ mov v19.16b, v16.16b
+ eor v7.16b, v7.16b, v5.16b
+ rev64 v6.4s, v6.4s
+ mov v19.s[1], v4.s[2]
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v17.4s, v17.4s, v7.4s
+ eor v20.16b, v17.16b, v2.16b
+ trn2 v2.4s, v6.4s, v19.4s
+ ushr v6.4s, v20.4s, #12
+ shl v19.4s, v20.4s, #20
+ add v5.4s, v5.4s, v2.4s
+ orr v6.16b, v19.16b, v6.16b
+ add v19.4s, v5.4s, v6.4s
+ eor v5.16b, v7.16b, v19.16b
+ zip1 v7.2d, v16.2d, v3.2d
+ zip2 v3.4s, v3.4s, v16.4s
+ tbl v20.16b, { v5.16b }, v1.16b
+ mov v7.s[3], v4.s[3]
+ add v17.4s, v17.4s, v20.4s
+ ext v5.16b, v7.16b, v7.16b, #12
+ eor v6.16b, v17.16b, v6.16b
+ uzp1 v5.4s, v7.4s, v5.4s
+ ext v7.16b, v19.16b, v19.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #4
+ ushr v19.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ add v7.4s, v7.4s, v5.4s
+ orr v6.16b, v6.16b, v19.16b
+ ext v19.16b, v20.16b, v20.16b, #8
+ add v7.4s, v7.4s, v6.4s
+ eor v19.16b, v19.16b, v7.16b
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v16.4s, v17.4s, v19.4s
+ zip1 v17.4s, v3.4s, v4.4s
+ zip1 v3.4s, v4.4s, v3.4s
+ eor v4.16b, v16.16b, v6.16b
+ ext v17.16b, v3.16b, v17.16b, #8
+ ushr v3.4s, v4.4s, #12
+ shl v4.4s, v4.4s, #20
+ add v6.4s, v7.4s, v17.4s
+ orr v3.16b, v4.16b, v3.16b
+ add v4.4s, v6.4s, v3.4s
+ ext v6.16b, v18.16b, v18.16b, #4
+ eor v7.16b, v19.16b, v4.16b
+ uzp1 v18.4s, v6.4s, v6.4s
+ tbl v7.16b, { v7.16b }, v1.16b
+ ext v18.16b, v18.16b, v6.16b, #8
+ add v16.4s, v16.4s, v7.4s
+ uzp2 v18.4s, v18.4s, v2.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v3.16b, v16.16b, v3.16b
add v4.4s, v4.4s, v18.4s
- ushr v17.4s, v16.4s, #7
- shl v16.4s, v16.4s, #25
- ext v23.16b, v22.16b, v22.16b, #12
+ ext v16.16b, v16.16b, v16.16b, #12
+ ushr v19.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
ext v4.16b, v4.16b, v4.16b, #4
- orr v16.16b, v16.16b, v17.16b
- ext v28.16b, v22.16b, v23.16b, #12
- ext v5.16b, v5.16b, v5.16b, #8
- add v4.4s, v16.4s, v4.4s
- tbl v3.16b, { v28.16b, v29.16b }, v3.16b
- eor v5.16b, v4.16b, v5.16b
- ext v6.16b, v6.16b, v6.16b, #12
- add v3.4s, v4.4s, v3.4s
- tbl v4.16b, { v5.16b }, v0.16b
- add v5.4s, v6.4s, v4.4s
- eor v6.16b, v16.16b, v5.16b
- ushr v16.4s, v6.4s, #12
+ orr v3.16b, v3.16b, v19.16b
+ ext v19.16b, v6.16b, v6.16b, #12
+ add v4.4s, v4.4s, v3.4s
+ ext v6.16b, v6.16b, v19.16b, #12
+ mov v19.16b, v17.16b
+ eor v7.16b, v7.16b, v4.16b
+ rev64 v6.4s, v6.4s
+ mov v19.s[1], v5.s[2]
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ eor v20.16b, v16.16b, v3.16b
+ trn2 v3.4s, v6.4s, v19.4s
+ ushr v6.4s, v20.4s, #12
+ shl v19.4s, v20.4s, #20
+ add v4.4s, v4.4s, v3.4s
+ orr v6.16b, v19.16b, v6.16b
+ zip1 v19.2d, v17.2d, v2.2d
+ zip2 v2.4s, v2.4s, v17.4s
+ add v4.4s, v4.4s, v6.4s
+ mov v19.s[3], v5.s[3]
+ zip1 v17.4s, v2.4s, v5.4s
+ zip1 v2.4s, v5.4s, v2.4s
+ eor v7.16b, v7.16b, v4.16b
+ ext v20.16b, v19.16b, v19.16b, #12
+ ext v4.16b, v4.16b, v4.16b, #12
+ ext v2.16b, v2.16b, v17.16b, #8
+ tbl v7.16b, { v7.16b }, v1.16b
+ add v16.4s, v16.4s, v7.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v21.16b, v16.16b, v6.16b
+ uzp1 v6.4s, v19.4s, v20.4s
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v19.4s, v21.4s, #7
+ shl v20.4s, v21.4s, #25
+ add v4.4s, v4.4s, v6.4s
+ orr v19.16b, v20.16b, v19.16b
+ add v4.4s, v4.4s, v19.4s
+ eor v7.16b, v7.16b, v4.16b
+ add v4.4s, v4.4s, v2.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ eor v5.16b, v16.16b, v19.16b
+ ushr v17.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v17.16b
+ ext v17.16b, v18.16b, v18.16b, #4
+ add v4.4s, v4.4s, v5.4s
+ uzp1 v18.4s, v17.4s, v17.4s
+ eor v7.16b, v7.16b, v4.16b
+ ext v18.16b, v18.16b, v17.16b, #8
+ tbl v7.16b, { v7.16b }, v1.16b
+ uzp2 v18.4s, v18.4s, v3.4s
+ add v16.4s, v16.4s, v7.4s
+ add v4.4s, v4.4s, v18.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v5.16b, v16.16b, v5.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ ext v16.16b, v16.16b, v16.16b, #12
+ ushr v19.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v19.16b
+ add v19.4s, v4.4s, v5.4s
+ eor v4.16b, v7.16b, v19.16b
+ ext v7.16b, v17.16b, v17.16b, #12
+ tbl v20.16b, { v4.16b }, v0.16b
+ ext v4.16b, v17.16b, v7.16b, #12
+ mov v7.16b, v2.16b
+ add v16.4s, v16.4s, v20.4s
+ rev64 v4.4s, v4.4s
+ mov v7.s[1], v6.s[2]
+ eor v5.16b, v16.16b, v5.16b
+ trn2 v4.4s, v4.4s, v7.4s
+ ushr v7.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v19.4s, v4.4s
+ zip1 v19.2d, v2.2d, v3.2d
+ zip2 v2.4s, v3.4s, v2.4s
+ orr v5.16b, v5.16b, v7.16b
+ mov v19.s[3], v6.s[3]
+ add v7.4s, v17.4s, v5.4s
+ eor v17.16b, v20.16b, v7.16b
+ ext v20.16b, v19.16b, v19.16b, #12
+ ext v7.16b, v7.16b, v7.16b, #12
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v16.4s, v16.4s, v17.4s
+ ext v17.16b, v17.16b, v17.16b, #8
+ eor v21.16b, v16.16b, v5.16b
+ uzp1 v5.4s, v19.4s, v20.4s
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v19.4s, v21.4s, #7
+ shl v20.4s, v21.4s, #25
+ add v7.4s, v7.4s, v5.4s
+ orr v19.16b, v20.16b, v19.16b
+ add v7.4s, v7.4s, v19.4s
+ eor v17.16b, v17.16b, v7.16b
+ tbl v17.16b, { v17.16b }, v0.16b
+ add v3.4s, v16.4s, v17.4s
+ zip1 v16.4s, v2.4s, v6.4s
+ zip1 v2.4s, v6.4s, v2.4s
+ eor v6.16b, v3.16b, v19.16b
+ ext v16.16b, v2.16b, v16.16b, #8
+ ushr v2.4s, v6.4s, #12
shl v6.4s, v6.4s, #20
- orr v6.16b, v6.16b, v16.16b
- tbl v2.16b, { v26.16b, v27.16b }, v2.16b
- add v3.4s, v3.4s, v6.4s
- ext v19.16b, v2.16b, v2.16b, #12
- eor v4.16b, v4.16b, v3.16b
- uzp1 v2.4s, v2.4s, v19.4s
+ add v7.4s, v7.4s, v16.4s
+ orr v2.16b, v6.16b, v2.16b
+ add v6.4s, v7.4s, v2.4s
+ ext v7.16b, v18.16b, v18.16b, #4
+ eor v17.16b, v17.16b, v6.16b
+ uzp1 v18.4s, v7.4s, v7.4s
+ tbl v17.16b, { v17.16b }, v1.16b
+ ext v18.16b, v18.16b, v7.16b, #8
+ add v3.4s, v3.4s, v17.4s
+ uzp2 v18.4s, v18.4s, v4.4s
+ eor v2.16b, v3.16b, v2.16b
+ add v6.4s, v6.4s, v18.4s
ext v3.16b, v3.16b, v3.16b, #12
- tbl v4.16b, { v4.16b }, v1.16b
- add v2.4s, v3.4s, v2.4s
- add v3.4s, v5.4s, v4.4s
- eor v5.16b, v6.16b, v3.16b
- ushr v6.4s, v5.4s, #7
+ ext v18.16b, v18.16b, v18.16b, #4
+ ushr v19.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v19.16b, v2.16b, v19.16b
+ ext v2.16b, v17.16b, v17.16b, #8
+ ext v17.16b, v7.16b, v7.16b, #12
+ add v6.4s, v6.4s, v19.4s
+ eor v2.16b, v2.16b, v6.16b
+ tbl v20.16b, { v2.16b }, v0.16b
+ ext v2.16b, v7.16b, v17.16b, #12
+ mov v7.16b, v16.16b
+ add v17.4s, v3.4s, v20.4s
+ rev64 v3.4s, v2.4s
+ mov v7.s[1], v5.s[2]
+ eor v19.16b, v17.16b, v19.16b
+ trn2 v3.4s, v3.4s, v7.4s
+ ushr v21.4s, v19.4s, #12
+ shl v19.4s, v19.4s, #20
+ add v6.4s, v6.4s, v3.4s
+ orr v19.16b, v19.16b, v21.16b
+ add v21.4s, v6.4s, v19.4s
+ eor v6.16b, v20.16b, v21.16b
+ zip1 v20.2d, v16.2d, v4.2d
+ zip2 v4.4s, v4.4s, v16.4s
+ tbl v22.16b, { v6.16b }, v1.16b
+ mov v20.s[3], v5.s[3]
+ add v17.4s, v17.4s, v22.4s
+ ext v6.16b, v20.16b, v20.16b, #12
+ eor v19.16b, v17.16b, v19.16b
+ uzp1 v6.4s, v20.4s, v6.4s
+ ext v20.16b, v21.16b, v21.16b, #12
+ ext v17.16b, v17.16b, v17.16b, #4
+ ushr v21.4s, v19.4s, #7
+ shl v19.4s, v19.4s, #25
+ add v20.4s, v20.4s, v6.4s
+ orr v19.16b, v19.16b, v21.16b
+ ext v21.16b, v22.16b, v22.16b, #8
+ add v20.4s, v20.4s, v19.4s
+ eor v21.16b, v21.16b, v20.16b
+ tbl v21.16b, { v21.16b }, v0.16b
+ add v16.4s, v17.4s, v21.4s
+ zip1 v17.4s, v4.4s, v5.4s
+ zip1 v4.4s, v5.4s, v4.4s
+ eor v5.16b, v16.16b, v19.16b
+ ext v4.16b, v4.16b, v17.16b, #8
+ ushr v17.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v19.4s, v20.4s, v4.4s
+ ext v20.16b, v18.16b, v18.16b, #8
+ zip1 v3.2d, v4.2d, v3.2d
+ orr v5.16b, v5.16b, v17.16b
+ zip2 v2.4s, v2.4s, v4.4s
+ uzp2 v7.4s, v20.4s, v7.4s
+ mov v3.s[3], v6.s[3]
+ add v17.4s, v19.4s, v5.4s
+ ext v7.16b, v7.16b, v20.16b, #4
+ eor v19.16b, v21.16b, v17.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ tbl v19.16b, { v19.16b }, v1.16b
+ add v7.4s, v17.4s, v7.4s
+ add v16.4s, v16.4s, v19.4s
+ ext v17.16b, v19.16b, v19.16b, #8
+ ext v19.16b, v18.16b, v18.16b, #12
+ eor v5.16b, v16.16b, v5.16b
+ ext v16.16b, v16.16b, v16.16b, #12
+ ext v18.16b, v18.16b, v19.16b, #12
+ mov v19.16b, v4.16b
+ ushr v20.4s, v5.4s, #7
shl v5.4s, v5.4s, #25
- orr v5.16b, v5.16b, v6.16b
- ext v4.16b, v4.16b, v4.16b, #8
- add v2.4s, v2.4s, v5.4s
- eor v4.16b, v2.16b, v4.16b
- ext v3.16b, v3.16b, v3.16b, #4
- tbl v0.16b, { v4.16b }, v0.16b
- add v3.4s, v3.4s, v0.4s
- eor v4.16b, v5.16b, v3.16b
- ushr v5.4s, v4.4s, #12
- shl v4.4s, v4.4s, #20
- add v2.4s, v2.4s, v7.4s
- orr v4.16b, v4.16b, v5.16b
- add v2.4s, v2.4s, v4.4s
+ rev64 v18.4s, v18.4s
+ mov v19.s[1], v6.s[2]
+ orr v5.16b, v5.16b, v20.16b
+ trn2 v18.4s, v18.4s, v19.4s
+ add v7.4s, v5.4s, v7.4s
+ eor v17.16b, v17.16b, v7.16b
+ add v7.4s, v7.4s, v18.4s
+ ext v18.16b, v3.16b, v3.16b, #12
+ tbl v17.16b, { v17.16b }, v0.16b
+ uzp1 v3.4s, v3.4s, v18.4s
+ add v16.4s, v16.4s, v17.4s
+ eor v5.16b, v16.16b, v5.16b
+ ushr v19.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v5.16b, v5.16b, v19.16b
+ add v7.4s, v7.4s, v5.4s
+ eor v17.16b, v17.16b, v7.16b
+ ext v7.16b, v7.16b, v7.16b, #12
+ tbl v17.16b, { v17.16b }, v1.16b
+ add v3.4s, v7.4s, v3.4s
+ add v16.4s, v16.4s, v17.4s
+ ext v7.16b, v17.16b, v17.16b, #8
+ eor v5.16b, v16.16b, v5.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v18.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v5.16b, v5.16b, v18.16b
+ add v3.4s, v3.4s, v5.4s
+ eor v7.16b, v7.16b, v3.16b
+ tbl v0.16b, { v7.16b }, v0.16b
+ zip1 v7.4s, v2.4s, v6.4s
+ zip1 v2.4s, v6.4s, v2.4s
+ add v4.4s, v16.4s, v0.4s
+ ext v2.16b, v2.16b, v7.16b, #8
+ eor v5.16b, v4.16b, v5.16b
+ add v2.4s, v3.4s, v2.4s
+ ushr v6.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ orr v3.16b, v5.16b, v6.16b
+ add v2.4s, v2.4s, v3.4s
eor v0.16b, v0.16b, v2.16b
- tbl v0.16b, { v0.16b }, v1.16b
- add v1.4s, v3.4s, v0.4s
- eor v3.16b, v4.16b, v1.16b
- ushr v4.4s, v3.4s, #7
- shl v3.4s, v3.4s, #25
ext v2.16b, v2.16b, v2.16b, #4
+ tbl v0.16b, { v0.16b }, v1.16b
+ add v1.4s, v4.4s, v0.4s
ext v0.16b, v0.16b, v0.16b, #8
+ eor v3.16b, v1.16b, v3.16b
ext v1.16b, v1.16b, v1.16b, #12
+ ushr v4.4s, v3.4s, #7
+ shl v3.4s, v3.4s, #25
+ stp q1, q0, [x0, #32]
orr v3.16b, v3.16b, v4.16b
- eor v2.16b, v2.16b, v1.16b
- eor v3.16b, v3.16b, v0.16b
- stp q2, q3, [x5]
- ldr q2, [x0]
- eor v1.16b, v2.16b, v1.16b
- str q1, [x5, #32]
- ldr q1, [x0, #16]
- eor v0.16b, v1.16b, v0.16b
- str q0, [x5, #48]
+ stp q2, q3, [x0]
ret
.Lfunc_end1:
- .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41
+ .size compress_pre, .Lfunc_end1-compress_pre
+ .cfi_endproc
+
+ .globl zfs_blake3_compress_xof_sse41
+ .p2align 2
+ .type zfs_blake3_compress_xof_sse41,@function
+zfs_blake3_compress_xof_sse41:
+ .cfi_startproc
+ hint #25
+ .cfi_negate_ra_state
+ sub sp, sp, #96
+ stp x29, x30, [sp, #64]
+ add x29, sp, #64
+ stp x20, x19, [sp, #80]
+ .cfi_def_cfa w29, 32
+ .cfi_offset w19, -8
+ .cfi_offset w20, -16
+ .cfi_offset w30, -24
+ .cfi_offset w29, -32
+ mov x20, x0
+ mov x19, x5
+ mov w5, w4
+ mov x4, x3
+ mov w3, w2
+ mov x2, x1
+ mov x0, sp
+ mov x1, x20
+ bl compress_pre
+ ldp q0, q1, [sp]
+ ldp q2, q3, [sp, #32]
+ eor v0.16b, v2.16b, v0.16b
+ eor v1.16b, v3.16b, v1.16b
+ ldp x29, x30, [sp, #64]
+ stp q0, q1, [x19]
+ ldr q0, [x20]
+ eor v0.16b, v0.16b, v2.16b
+ str q0, [x19, #32]
+ ldr q0, [x20, #16]
+ eor v0.16b, v0.16b, v3.16b
+ str q0, [x19, #48]
+ ldp x20, x19, [sp, #80]
+ add sp, sp, #96
+ hint #29
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_compress_xof_sse41, .Lfunc_end2-zfs_blake3_compress_xof_sse41
.cfi_endproc
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
-.LCPI2_0:
+.LCPI3_0:
.word 0
.word 1
.word 2
.word 3
-.LCPI2_1:
+.LCPI3_1:
.byte 2
.byte 3
.byte 0
@@ -1078,7 +628,7 @@ zfs_blake3_compress_xof_sse41:
.byte 15
.byte 12
.byte 13
-.LCPI2_2:
+.LCPI3_2:
.byte 1
.byte 2
.byte 3
@@ -1095,25 +645,29 @@ zfs_blake3_compress_xof_sse41:
.byte 14
.byte 15
.byte 12
+.LCPI3_3:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
.text
.globl zfs_blake3_hash_many_sse41
.p2align 2
.type zfs_blake3_hash_many_sse41,@function
zfs_blake3_hash_many_sse41:
.cfi_startproc
- stp d15, d14, [sp, #-160]!
+ hint #34
+ stp d15, d14, [sp, #-144]!
stp d13, d12, [sp, #16]
stp d11, d10, [sp, #32]
stp d9, d8, [sp, #48]
- stp x29, x30, [sp, #64]
- stp x28, x27, [sp, #80]
- stp x26, x25, [sp, #96]
- stp x24, x23, [sp, #112]
- stp x22, x21, [sp, #128]
- stp x20, x19, [sp, #144]
- mov x29, sp
- sub sp, sp, #448
- .cfi_def_cfa w29, 160
+ stp x29, x27, [sp, #64]
+ stp x26, x25, [sp, #80]
+ stp x24, x23, [sp, #96]
+ stp x22, x21, [sp, #112]
+ stp x20, x19, [sp, #128]
+ sub sp, sp, #368
+ .cfi_def_cfa_offset 512
.cfi_offset w19, -8
.cfi_offset w20, -16
.cfi_offset w21, -24
@@ -1123,1341 +677,1722 @@ zfs_blake3_hash_many_sse41:
.cfi_offset w25, -56
.cfi_offset w26, -64
.cfi_offset w27, -72
- .cfi_offset w28, -80
- .cfi_offset w30, -88
- .cfi_offset w29, -96
- .cfi_offset b8, -104
- .cfi_offset b9, -112
- .cfi_offset b10, -120
- .cfi_offset b11, -128
- .cfi_offset b12, -136
- .cfi_offset b13, -144
- .cfi_offset b14, -152
- .cfi_offset b15, -160
- ldr x26, [x29, #168]
- ldrb w27, [x29, #160]
- mov w19, w6
- mov x20, x4
- mov x22, x2
- mov x28, x1
+ .cfi_offset w29, -80
+ .cfi_offset b8, -88
+ .cfi_offset b9, -96
+ .cfi_offset b10, -104
+ .cfi_offset b11, -112
+ .cfi_offset b12, -120
+ .cfi_offset b13, -128
+ .cfi_offset b14, -136
+ .cfi_offset b15, -144
+ ldr x8, [sp, #520]
+ adrp x11, .LCPI3_1
+ ldrb w9, [sp, #512]
+ adrp x10, .LCPI3_2
cmp x1, #4
- mov x24, x0
- str x3, [sp, #40]
- b.lo .LBB2_8
- adrp x11, .LCPI2_0
- ldr q0, [x11, :lo12:.LCPI2_0]
+ b.lo .LBB3_6
+ adrp x12, .LCPI3_0
sbfx w13, w5, #0, #1
+ mov w15, #58983
+ mov w16, #44677
+ movk w15, #27145, lsl #16
+ movk w16, #47975, lsl #16
+ ldr q0, [x12, :lo12:.LCPI3_0]
dup v1.4s, w13
- mov w10, #58983
- mov w11, #44677
- mov w12, #62322
+ movi v13.4s, #64
+ mov w13, #62322
+ mov w14, #62778
+ orr w12, w7, w6
and v0.16b, v1.16b, v0.16b
- mov w13, #62778
- orr w8, w7, w19
- adrp x9, .LCPI2_1
- movk w10, #27145, lsl #16
- movk w11, #47975, lsl #16
- movk w12, #15470, lsl #16
- movk w13, #42319, lsl #16
- str q0, [sp, #16]
+ ldr q1, [x11, :lo12:.LCPI3_1]
+ movk w13, #15470, lsl #16
+ movk w14, #42319, lsl #16
+ dup v14.4s, w15
+ stp q0, q1, [sp, #16]
orr v0.4s, #128, lsl #24
- adrp x14, .LCPI2_2
str q0, [sp]
-.LBB2_2:
- ldr x2, [sp, #40]
- mov x15, x2
- ld1r { v7.4s }, [x15], #4
- add x16, x2, #8
- add x17, x2, #12
- add x18, x2, #16
- add x0, x2, #20
- add x3, x2, #24
- add x2, x2, #28
- ld1r { v6.4s }, [x16]
- ld1r { v17.4s }, [x17]
- ld1r { v10.4s }, [x18]
- ld1r { v11.4s }, [x0]
- ld1r { v19.4s }, [x3]
- ld1r { v18.4s }, [x15]
- ld1r { v16.4s }, [x2]
- cbz x22, .LBB2_7
+ dup v0.4s, w16
+ stp q0, q14, [sp, #48]
+ b .LBB3_3
+.LBB3_2:
+ zip1 v0.4s, v29.4s, v8.4s
+ add x15, x4, #4
+ zip1 v1.4s, v30.4s, v31.4s
+ tst w5, #0x1
+ zip1 v2.4s, v24.4s, v18.4s
+ csel x4, x15, x4, ne
+ zip1 v3.4s, v25.4s, v26.4s
+ add x0, x0, #32
+ zip2 v6.4s, v29.4s, v8.4s
+ sub x1, x1, #4
+ zip1 v4.2d, v0.2d, v1.2d
+ cmp x1, #3
+ zip2 v7.4s, v30.4s, v31.4s
+ zip1 v5.2d, v2.2d, v3.2d
+ zip2 v0.2d, v0.2d, v1.2d
+ zip2 v1.2d, v2.2d, v3.2d
+ zip2 v2.4s, v24.4s, v18.4s
+ zip2 v3.4s, v25.4s, v26.4s
+ stp q4, q5, [x8]
+ zip2 v4.2d, v6.2d, v7.2d
+ stp q0, q1, [x8, #32]
+ zip1 v0.2d, v6.2d, v7.2d
+ zip1 v1.2d, v2.2d, v3.2d
+ zip2 v2.2d, v2.2d, v3.2d
+ stp q0, q1, [x8, #64]
+ stp q4, q2, [x8, #96]
+ add x8, x8, #128
+ b.ls .LBB3_6
+.LBB3_3:
+ mov x15, x3
+ add x16, x3, #8
+ add x17, x3, #12
+ add x19, x3, #16
+ add x20, x3, #20
+ ld1r { v29.4s }, [x15], #4
+ ld1r { v30.4s }, [x16]
+ add x16, x3, #24
+ ld1r { v31.4s }, [x17]
+ add x17, x3, #28
+ ld1r { v24.4s }, [x19]
+ ld1r { v18.4s }, [x20]
+ ld1r { v25.4s }, [x16]
+ ld1r { v8.4s }, [x15]
+ ld1r { v26.4s }, [x17]
+ cbz x2, .LBB3_2
ldr q1, [sp, #16]
- dup v0.4s, w20
- ldp x15, x16, [x24]
- ldp x17, x18, [x24, #16]
+ dup v0.4s, w4
+ lsr x17, x4, #32
+ mov x15, xzr
+ ldp x19, x20, [x0, #16]
add v1.4s, v0.4s, v1.4s
+ mov x21, x2
movi v0.4s, #128, lsl #24
- str q1, [sp, #64]
+ mov w26, w12
+ str q1, [sp, #96]
eor v0.16b, v1.16b, v0.16b
ldr q1, [sp]
- lsr x2, x20, #32
- mov x0, xzr
- mov w6, w8
cmgt v0.4s, v1.4s, v0.4s
- dup v1.4s, w2
+ dup v1.4s, w17
+ ldp x16, x17, [x0]
sub v0.4s, v1.4s, v0.4s
- str q0, [sp, #48]
-.LBB2_4:
- mov w4, #16
- stp q16, q17, [sp, #192]
- bfi x4, x0, #6, #58
- ldr q1, [x15, x4]
- ldr q3, [x16, x4]
- ldr q2, [x17, x4]
- ldr q4, [x18, x4]
- mov w4, #32
- bfi x4, x0, #6, #58
- ldr q5, [x15, x4]
- ldr q20, [x16, x4]
- ldr q21, [x17, x4]
- ldr q22, [x18, x4]
- mov w4, #48
- lsl x3, x0, #6
- bfi x4, x0, #6, #58
- add x0, x0, #1
- ldr q0, [x15, x3]
- ldr q23, [x16, x3]
- ldr q16, [x17, x3]
- ldr q17, [x18, x3]
- cmp x0, x22
- ldr q25, [x15, x4]
- ldr q14, [x16, x4]
- ldr q28, [x17, x4]
- ldr q31, [x18, x4]
- csel w4, w27, wzr, eq
- orr w4, w4, w6
- mov x2, xzr
- and w6, w4, #0xff
- add x3, x3, #256
-.LBB2_5:
- ldr x4, [x24, x2]
- add x2, x2, #8
- cmp x2, #32
- add x4, x4, x3
- prfm pldl1keep, [x4]
- b.ne .LBB2_5
- zip1 v29.4s, v0.4s, v23.4s
- zip2 v23.4s, v0.4s, v23.4s
- zip1 v0.4s, v16.4s, v17.4s
- zip2 v24.4s, v16.4s, v17.4s
- zip1 v9.4s, v1.4s, v3.4s
- zip2 v26.4s, v1.4s, v3.4s
- zip1 v27.4s, v2.4s, v4.4s
- zip2 v17.4s, v2.4s, v4.4s
- zip1 v12.4s, v21.4s, v22.4s
- zip2 v13.4s, v21.4s, v22.4s
- add v2.4s, v7.4s, v10.4s
- add v1.4s, v18.4s, v11.4s
- ext v7.16b, v0.16b, v29.16b, #8
- ext v22.16b, v24.16b, v23.16b, #8
- zip1 v30.4s, v5.4s, v20.4s
- zip2 v20.4s, v5.4s, v20.4s
- stp q1, q2, [sp, #112]
- ext v2.16b, v29.16b, v7.16b, #8
- mov v29.d[1], v0.d[0]
- ext v18.16b, v23.16b, v22.16b, #8
- mov v23.d[1], v24.d[0]
- zip1 v21.4s, v25.4s, v14.4s
- zip2 v4.4s, v25.4s, v14.4s
- zip1 v14.4s, v28.4s, v31.4s
- zip2 v15.4s, v28.4s, v31.4s
- add v8.4s, v6.4s, v19.4s
- ext v28.16b, v27.16b, v9.16b, #8
- ext v31.16b, v17.16b, v26.16b, #8
- stur q2, [x29, #-208]
- mov v7.16b, v29.16b
- ext v0.16b, v12.16b, v30.16b, #8
- stp q23, q29, [x29, #-80]
- mov v2.16b, v19.16b
- ext v19.16b, v13.16b, v20.16b, #8
- mov v29.16b, v9.16b
- ext v25.16b, v9.16b, v28.16b, #8
- mov v29.d[1], v27.d[0]
- ext v24.16b, v26.16b, v31.16b, #8
- mov v26.d[1], v17.d[0]
- ext v17.16b, v15.16b, v4.16b, #8
- ext v27.16b, v30.16b, v0.16b, #8
- ext v0.16b, v20.16b, v19.16b, #8
- stp q0, q25, [sp, #80]
- ext v0.16b, v4.16b, v17.16b, #8
- str q0, [sp, #224]
- ldr q0, [sp, #128]
- mov v6.16b, v23.16b
- mov v22.16b, v4.16b
- ldr q16, [x9, :lo12:.LCPI2_1]
- add v17.4s, v0.4s, v7.4s
- ldr q0, [sp, #112]
- mov v30.d[1], v12.d[0]
- add v7.4s, v8.4s, v29.4s
- mov v20.d[1], v13.d[0]
- add v4.4s, v0.4s, v6.4s
- ldr q0, [sp, #64]
- dup v3.4s, w12
- ext v28.16b, v14.16b, v21.16b, #8
- dup v1.4s, w10
- eor v19.16b, v17.16b, v0.16b
- ldr q0, [sp, #48]
- ext v23.16b, v21.16b, v28.16b, #8
- mov v21.d[1], v14.d[0]
- tbl v14.16b, { v19.16b }, v16.16b
- eor v12.16b, v4.16b, v0.16b
- movi v0.4s, #64
- eor v13.16b, v7.16b, v0.16b
- tbl v13.16b, { v13.16b }, v16.16b
- add v6.4s, v13.4s, v3.4s
- dup v5.4s, w11
- tbl v12.16b, { v12.16b }, v16.16b
- add v1.4s, v14.4s, v1.4s
- eor v9.16b, v6.16b, v2.16b
- ldp q2, q0, [sp, #192]
- add v5.4s, v12.4s, v5.4s
- eor v19.16b, v1.16b, v10.16b
- eor v10.16b, v5.16b, v11.16b
- ushr v11.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v11.16b, v19.16b, v11.16b
- ushr v19.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- mov v22.d[1], v15.d[0]
- orr v10.16b, v10.16b, v19.16b
- ushr v19.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- add v15.4s, v0.4s, v2.4s
- orr v9.16b, v9.16b, v19.16b
- dup v19.4s, w6
- add v15.4s, v15.4s, v26.4s
- eor v19.16b, v15.16b, v19.16b
- tbl v3.16b, { v19.16b }, v16.16b
- dup v19.4s, w13
- add v8.4s, v3.4s, v19.4s
- ldur q31, [x29, #-208]
- eor v19.16b, v8.16b, v2.16b
- ushr v0.4s, v19.4s, #12
- shl v19.4s, v19.4s, #20
- orr v2.16b, v19.16b, v0.16b
- ldr q19, [x14, :lo12:.LCPI2_2]
- add v17.4s, v17.4s, v31.4s
- add v17.4s, v17.4s, v11.4s
- eor v14.16b, v14.16b, v17.16b
- tbl v14.16b, { v14.16b }, v19.16b
- add v1.4s, v1.4s, v14.4s
- eor v11.16b, v1.16b, v11.16b
- add v4.4s, v4.4s, v18.4s
- ushr v0.4s, v11.4s, #7
- shl v11.4s, v11.4s, #25
- add v4.4s, v4.4s, v10.4s
- orr v0.16b, v11.16b, v0.16b
- eor v11.16b, v12.16b, v4.16b
- tbl v11.16b, { v11.16b }, v19.16b
- add v5.4s, v5.4s, v11.4s
- eor v10.16b, v5.16b, v10.16b
- add v7.4s, v7.4s, v25.4s
- ushr v12.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- add v7.4s, v7.4s, v9.4s
- orr v10.16b, v10.16b, v12.16b
- eor v12.16b, v13.16b, v7.16b
- tbl v12.16b, { v12.16b }, v19.16b
- add v6.4s, v6.4s, v12.4s
- eor v9.16b, v6.16b, v9.16b
- ushr v13.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- orr v9.16b, v9.16b, v13.16b
- add v13.4s, v15.4s, v24.4s
- add v13.4s, v13.4s, v2.4s
- eor v3.16b, v3.16b, v13.16b
- tbl v3.16b, { v3.16b }, v19.16b
- add v8.4s, v8.4s, v3.4s
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v30.4s
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v20.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v21.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v22.4s
- mov v28.16b, v26.16b
- stur q26, [x29, #-112]
- mov v26.16b, v18.16b
- mov v18.16b, v24.16b
- stur q24, [x29, #-160]
- add v6.4s, v6.4s, v3.4s
- mov v24.16b, v20.16b
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldr q20, [sp, #80]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- stp q30, q22, [x29, #-192]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- mov v30.16b, v27.16b
- add v17.4s, v17.4s, v27.4s
- ldr q27, [sp, #224]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v20.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
- shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v23.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v27.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- stur q21, [x29, #-144]
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- ldur q21, [x29, #-80]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
+ str q0, [sp, #80]
+.LBB3_5:
+ add x23, x16, x15
+ add x24, x17, x15
+ add x22, x19, x15
+ add x25, x20, x15
+ subs x21, x21, #1
+ add x15, x15, #64
+ ldp q1, q2, [x23]
+ csel w27, w9, wzr, eq
+ orr w26, w27, w26
+ and w26, w26, #0xff
+ ldp q4, q5, [x24]
+ dup v0.4s, w26
+ mov w26, w6
+ zip1 v22.4s, v1.4s, v4.4s
+ zip2 v20.4s, v1.4s, v4.4s
+ ldp q6, q7, [x22]
+ zip1 v17.4s, v2.4s, v5.4s
+ zip2 v23.4s, v2.4s, v5.4s
+ ldp q16, q21, [x25]
+ zip1 v19.4s, v6.4s, v16.4s
+ zip2 v1.4s, v6.4s, v16.4s
+ ldp q27, q28, [x23, #32]
+ zip1 v4.4s, v7.4s, v21.4s
+ zip2 v5.4s, v7.4s, v21.4s
+ zip2 v15.2d, v17.2d, v4.2d
+ ldp q9, q10, [x24, #32]
+ mov v17.d[1], v4.d[0]
+ add v4.4s, v30.4s, v25.4s
+ zip2 v11.2d, v23.2d, v5.2d
+ zip2 v3.4s, v27.4s, v9.4s
+ zip1 v7.4s, v27.4s, v9.4s
+ ldp q12, q6, [x22, #32]
+ mov v23.d[1], v5.d[0]
+ stp q11, q3, [sp, #256]
+ add v5.4s, v31.4s, v26.4s
+ add v4.4s, v4.4s, v17.4s
+ str q23, [sp, #352]
+ ldp q16, q2, [x25, #32]
+ add v5.4s, v5.4s, v23.4s
+ zip1 v3.4s, v12.4s, v16.4s
eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
- shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v21.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v26.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v18.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v29.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
+ zip1 v9.4s, v6.4s, v2.4s
+ zip2 v2.4s, v6.4s, v2.4s
+ stp q7, q3, [sp, #208]
+ zip2 v3.4s, v12.4s, v16.4s
+ zip1 v12.4s, v28.4s, v10.4s
+ zip2 v10.4s, v28.4s, v10.4s
+ stp q17, q2, [sp, #160]
+ zip2 v28.2d, v22.2d, v19.2d
+ mov v22.d[1], v19.d[0]
+ str q3, [sp, #240]
+ add v2.4s, v8.4s, v18.4s
+ eor v16.16b, v4.16b, v13.16b
+ dup v17.4s, w13
+ mov v3.16b, v22.16b
+ stp q22, q28, [sp, #320]
+ zip2 v22.2d, v20.2d, v1.2d
+ mov v20.d[1], v1.d[0]
+ add v1.4s, v29.4s, v24.4s
+ add v4.4s, v4.4s, v15.4s
add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q22, [x29, #-64]
- ushr v15.4s, v0.4s, #12
+ add v2.4s, v2.4s, v20.4s
+ stp q15, q20, [sp, #288]
+ add v1.4s, v1.4s, v3.4s
+ ldr q3, [sp, #96]
+ dup v20.4s, w14
+ mov v23.16b, v22.16b
+ mov v15.16b, v10.16b
+ eor v6.16b, v1.16b, v3.16b
+ ldr q3, [sp, #80]
+ add v1.4s, v1.4s, v28.4s
+ ldr q28, [sp, #272]
+ str q23, [sp, #128]
+ eor v7.16b, v2.16b, v3.16b
+ ldp q27, q3, [sp, #32]
+ add v2.4s, v2.4s, v22.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v7.16b, { v7.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ tbl v0.16b, { v0.16b }, v27.16b
+ add v19.4s, v6.4s, v14.4s
+ add v21.4s, v7.4s, v3.4s
+ add v30.4s, v16.4s, v17.4s
+ add v31.4s, v0.4s, v20.4s
+ eor v24.16b, v19.16b, v24.16b
+ eor v17.16b, v21.16b, v18.16b
+ ushr v18.4s, v24.4s, #12
+ shl v20.4s, v24.4s, #20
+ eor v24.16b, v30.16b, v25.16b
+ eor v25.16b, v31.16b, v26.16b
+ ushr v26.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ ushr v29.4s, v24.4s, #12
+ shl v24.4s, v24.4s, #20
+ ushr v8.4s, v25.4s, #12
+ shl v25.4s, v25.4s, #20
+ orr v3.16b, v20.16b, v18.16b
+ ldr q18, [x10, :lo12:.LCPI3_2]
+ orr v13.16b, v17.16b, v26.16b
+ orr v24.16b, v24.16b, v29.16b
+ orr v14.16b, v25.16b, v8.16b
+ add v8.4s, v1.4s, v3.4s
+ add v29.4s, v2.4s, v13.4s
+ add v17.4s, v4.4s, v24.4s
+ add v20.4s, v5.4s, v14.4s
+ eor v1.16b, v6.16b, v8.16b
+ eor v2.16b, v7.16b, v29.16b
+ eor v4.16b, v16.16b, v17.16b
+ eor v0.16b, v0.16b, v20.16b
+ tbl v25.16b, { v1.16b }, v18.16b
+ tbl v16.16b, { v2.16b }, v18.16b
+ tbl v6.16b, { v4.16b }, v18.16b
+ tbl v4.16b, { v0.16b }, v18.16b
+ add v19.4s, v19.4s, v25.4s
+ add v21.4s, v21.4s, v16.4s
+ add v26.4s, v30.4s, v6.4s
+ add v7.4s, v31.4s, v4.4s
+ eor v0.16b, v19.16b, v3.16b
+ eor v1.16b, v21.16b, v13.16b
+ eor v2.16b, v26.16b, v24.16b
+ eor v3.16b, v7.16b, v14.16b
+ ushr v5.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v24.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v30.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ orr v5.16b, v0.16b, v5.16b
+ orr v0.16b, v1.16b, v24.16b
+ ushr v31.4s, v3.4s, #7
+ orr v2.16b, v2.16b, v30.16b
+ ldp q24, q30, [sp, #208]
+ shl v3.4s, v3.4s, #25
+ zip2 v14.2d, v12.2d, v9.2d
+ mov v22.16b, v24.16b
+ orr v1.16b, v3.16b, v31.16b
+ zip2 v3.2d, v24.2d, v30.2d
+ mov v24.16b, v28.16b
+ mov v22.d[1], v30.d[0]
+ ldr q30, [sp, #240]
+ mov v31.16b, v12.16b
+ stp q22, q14, [sp, #224]
+ mov v24.d[1], v30.d[0]
+ add v12.4s, v8.4s, v22.4s
+ mov v31.d[1], v9.d[0]
+ add v22.4s, v29.4s, v24.4s
+ ldr q29, [sp, #176]
+ zip2 v28.2d, v28.2d, v30.2d
+ mov v9.16b, v24.16b
+ mov v15.d[1], v29.d[0]
+ zip2 v8.2d, v10.2d, v29.2d
+ add v10.4s, v12.4s, v0.4s
+ add v22.4s, v22.4s, v2.4s
+ str q9, [sp, #144]
+ add v20.4s, v20.4s, v15.4s
+ add v17.4s, v17.4s, v31.4s
+ stp q3, q8, [sp, #192]
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v28.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v24.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v22.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v23.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q22, [x29, #-144]
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v3.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v28.4s
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v8.4s
+ orr v1.16b, v1.16b, v12.16b
+ add v17.4s, v17.4s, v14.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v31.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v22.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v30.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v27.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldr q27, [sp, #96]
- mov v21.16b, v26.16b
- stur q26, [x29, #-96]
- mov v28.16b, v31.16b
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldp q31, q26, [x29, #-192]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v20.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v27.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v23.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v11.4s
+ mov v30.16b, v28.16b
+ mov v28.16b, v23.16b
+ ldr q23, [sp, #304]
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ mov v29.16b, v31.16b
+ ldr q31, [sp, #160]
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v23.4s
+ orr v1.16b, v1.16b, v12.16b
+ str q29, [sp, #272]
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v31.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v26.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v31.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- mov v18.16b, v24.16b
- mov v24.16b, v20.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ add v22.4s, v22.4s, v24.4s
+ ldr q24, [sp, #320]
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v24.4s
+ ldr q24, [sp, #352]
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v24.4s
+ ldr q24, [sp, #336]
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v14.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- ldur q20, [x29, #-160]
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v21.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v18.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v23.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v20.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q25, [x29, #-80]
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v10.4s, v10.4s, v24.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v29.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v8.4s
+ ldr q8, [sp, #288]
+ orr v1.16b, v1.16b, v13.16b
+ add v17.4s, v17.4s, v3.4s
+ ldr q3, [sp, #352]
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v29.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v30.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v8.4s
+ mov v24.16b, v30.16b
+ mov v30.16b, v15.16b
+ add v17.4s, v17.4s, v15.4s
+ ldr q15, [sp, #224]
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ str q30, [sp, #176]
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v15.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
add v7.4s, v7.4s, v25.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v26.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- ldur q25, [x29, #-112]
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #7
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v25.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v30.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v24.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v31.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldur q25, [x29, #-64]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldr q31, [sp, #224]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v27.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v25.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v9.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v14.4s
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v28.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v11.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v31.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v28.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v22.4s, v22.4s, v29.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v23.4s
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v31.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v30.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v18.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v26.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v23.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- mov v21.16b, v29.16b
- stur q29, [x29, #-128]
- mov v29.16b, v30.16b
- mov v30.16b, v27.16b
- mov v27.16b, v18.16b
- str q18, [sp, #176]
- eor v0.16b, v0.16b, v1.16b
- mov v18.16b, v22.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ldur q22, [x29, #-96]
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ add v10.4s, v10.4s, v3.4s
+ ldr q3, [sp, #192]
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v3.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v15.4s
+ ldr q15, [sp, #128]
+ orr v1.16b, v1.16b, v13.16b
+ add v17.4s, v17.4s, v24.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v20.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v29.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v22.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v31.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ ldp q23, q11, [sp, #320]
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v8.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v23.4s
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ mov v28.16b, v31.16b
+ mov v31.16b, v8.16b
+ ldr q8, [sp, #208]
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v11.4s
+ orr v1.16b, v1.16b, v12.16b
+ add v17.4s, v17.4s, v8.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v21.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v24.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v30.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v28.4s
- add v6.4s, v6.4s, v3.4s
- mov v22.16b, v24.16b
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldur q24, [x29, #-80]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- mov v21.16b, v30.16b
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldur q30, [x29, #-192]
- mov v20.16b, v29.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- ldur q29, [x29, #-112]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v25.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v24.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v29.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v30.4s
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v9.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v14.4s
+ ldr q14, [sp, #256]
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v30.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v29.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v22.4s, v22.4s, v3.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v15.4s
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v14.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v8.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v18.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v20.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v31.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v26.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v10.4s, v10.4s, v28.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v24.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v11.4s
+ ldr q11, [sp, #304]
+ orr v1.16b, v1.16b, v13.16b
+ add v17.4s, v17.4s, v31.4s
+ ldr q31, [sp, #224]
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- add v17.4s, v17.4s, v23.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v27.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v30.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- ldur q27, [x29, #-160]
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v23.4s
+ ldr q23, [sp, #240]
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v11.4s
+ mov v30.16b, v8.16b
+ mov v8.16b, v24.16b
+ ldr q24, [sp, #352]
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v13.16b
+ str q8, [sp, #112]
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v24.4s
+ orr v1.16b, v1.16b, v12.16b
+ add v17.4s, v17.4s, v31.4s
+ eor v4.16b, v4.16b, v10.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ mov v29.16b, v3.16b
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v27.4s
- mov v28.16b, v25.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v21.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v28.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v29.4s
- mov v25.16b, v31.16b
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- ldur q31, [x29, #-96]
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v13.16b
- ldur q28, [x29, #-208]
- mov v18.16b, v20.16b
- str q20, [sp, #144]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- ldur q20, [x29, #-128]
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- add v17.4s, v17.4s, v24.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v31.4s
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #12
+ eor v5.16b, v21.16b, v5.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v22.4s, v22.4s, v29.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v17.4s, v17.4s, v30.4s
+ ldr q30, [sp, #272]
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v22.4s, v22.4s, v0.4s
+ mov v3.16b, v28.16b
+ ldr q28, [sp, #176]
+ orr v5.16b, v5.16b, v13.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v30.4s
+ orr v1.16b, v1.16b, v12.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v28.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v28.4s
- orr v0.16b, v0.16b, v15.16b
- tbl v3.16b, { v3.16b }, v19.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v13.4s, v13.4s, v20.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v0.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v13.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v1.16b, v2.16b
- add v5.4s, v5.4s, v12.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v2.16b, v2.16b, v15.16b
- ushr v15.4s, v0.4s, #7
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ eor v5.16b, v5.16b, v19.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ add v22.4s, v22.4s, v8.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v9.4s
+ ldr q9, [sp, #320]
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v22.4s, v22.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v17.4s, v17.4s, v2.4s
+ add v10.4s, v10.4s, v23.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v16.16b, v16.16b, v22.16b
+ add v20.4s, v20.4s, v31.4s
+ eor v6.16b, v6.16b, v17.16b
+ add v10.4s, v10.4s, v5.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v20.4s, v20.4s, v1.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v4.16b, v4.16b, v20.16b
+ add v26.4s, v26.4s, v6.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v2.16b, v26.16b, v2.16b
+ add v19.4s, v19.4s, v25.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v15.16b
- add v17.4s, v17.4s, v18.4s
- add v17.4s, v17.4s, v0.4s
- add v4.4s, v4.4s, v22.4s
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v30.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v25.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v16.16b
- eor v3.16b, v3.16b, v13.16b
- add v17.4s, v17.4s, v26.4s
- mov v26.16b, v21.16b
- add v4.4s, v4.4s, v21.4s
- ldur q21, [x29, #-144]
- ushr v15.4s, v0.4s, #12
+ add v7.4s, v7.4s, v4.4s
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ add v10.4s, v10.4s, v14.4s
+ ldr q14, [sp, #288]
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v2.16b, v2.16b, v13.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v14.4s
+ ushr v13.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v2.4s
+ add v20.4s, v20.4s, v24.4s
+ orr v1.16b, v1.16b, v13.16b
+ eor v4.16b, v4.16b, v10.16b
+ add v17.4s, v17.4s, v9.4s
+ eor v25.16b, v25.16b, v22.16b
+ add v20.4s, v20.4s, v5.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ add v17.4s, v17.4s, v1.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ add v26.4s, v26.4s, v4.4s
+ eor v16.16b, v16.16b, v17.16b
+ add v7.4s, v7.4s, v25.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v2.16b, v7.16b, v2.16b
+ add v21.4s, v21.4s, v6.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v16.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v8.16b, v2.16b
- add v17.4s, v17.4s, v0.4s
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
+ add v19.4s, v19.4s, v16.4s
+ ushr v13.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- eor v14.16b, v14.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v7.4s, v7.4s, v21.4s
- orr v2.16b, v2.16b, v15.16b
- tbl v14.16b, { v14.16b }, v19.16b
- eor v11.16b, v11.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- add v13.4s, v13.4s, v28.4s
- add v1.4s, v1.4s, v14.4s
- tbl v11.16b, { v11.16b }, v19.16b
- eor v12.16b, v12.16b, v7.16b
- add v13.4s, v13.4s, v2.4s
- str q23, [sp, #160]
- eor v0.16b, v0.16b, v1.16b
- add v5.4s, v5.4s, v11.4s
- tbl v12.16b, { v12.16b }, v19.16b
- eor v3.16b, v3.16b, v13.16b
- add v17.4s, v17.4s, v23.4s
- ldur q23, [x29, #-64]
- ushr v15.4s, v0.4s, #7
+ eor v5.16b, v21.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ eor v1.16b, v19.16b, v1.16b
+ add v10.4s, v10.4s, v11.4s
+ orr v2.16b, v2.16b, v13.16b
+ ushr v13.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ushr v12.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ add v22.4s, v22.4s, v15.4s
+ orr v5.16b, v5.16b, v13.16b
+ add v20.4s, v20.4s, v3.4s
+ mov v24.16b, v3.16b
+ ldr q3, [sp, #336]
+ orr v1.16b, v1.16b, v12.16b
+ eor v4.16b, v4.16b, v10.16b
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v3.4s
+ add v20.4s, v20.4s, v5.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v25.16b, v25.16b, v22.16b
+ add v17.4s, v17.4s, v1.4s
+ eor v6.16b, v6.16b, v20.16b
+ add v26.4s, v26.4s, v4.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ eor v16.16b, v16.16b, v17.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ add v7.4s, v7.4s, v25.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ add v21.4s, v21.4s, v6.4s
+ ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
- eor v10.16b, v5.16b, v10.16b
- add v6.4s, v6.4s, v12.4s
- tbl v3.16b, { v3.16b }, v19.16b
- orr v0.16b, v0.16b, v15.16b
- ushr v15.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- eor v9.16b, v6.16b, v9.16b
- add v8.4s, v8.4s, v3.4s
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v2.16b, v8.16b, v2.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #7
+ eor v2.16b, v7.16b, v2.16b
+ add v19.4s, v19.4s, v16.4s
+ eor v5.16b, v21.16b, v5.16b
+ orr v0.16b, v0.16b, v12.16b
+ ushr v12.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v23.4s
- orr v2.16b, v2.16b, v15.16b
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v24.4s
- tbl v3.16b, { v3.16b }, v16.16b
- eor v14.16b, v14.16b, v4.16b
- add v7.4s, v7.4s, v2.4s
- add v6.4s, v6.4s, v3.4s
- tbl v14.16b, { v14.16b }, v16.16b
- eor v11.16b, v11.16b, v7.16b
- add v13.4s, v13.4s, v20.4s
- eor v10.16b, v6.16b, v10.16b
- add v8.4s, v8.4s, v14.4s
- tbl v11.16b, { v11.16b }, v16.16b
- add v13.4s, v13.4s, v0.4s
- ldr q20, [sp, #176]
- ushr v15.4s, v10.4s, #12
- shl v10.4s, v10.4s, #20
- eor v9.16b, v8.16b, v9.16b
- add v1.4s, v1.4s, v11.4s
- eor v12.16b, v12.16b, v13.16b
- orr v10.16b, v10.16b, v15.16b
- ushr v15.4s, v9.4s, #12
- shl v9.4s, v9.4s, #20
- eor v2.16b, v1.16b, v2.16b
- tbl v12.16b, { v12.16b }, v16.16b
- orr v9.16b, v9.16b, v15.16b
- ushr v15.4s, v2.4s, #12
- shl v2.4s, v2.4s, #20
- add v5.4s, v5.4s, v12.4s
+ eor v1.16b, v19.16b, v1.16b
+ ushr v13.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v22.4s, v22.4s, v8.4s
+ orr v2.16b, v2.16b, v12.16b
+ ushr v12.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v13.16b
+ add v22.4s, v22.4s, v0.4s
+ add v10.4s, v10.4s, v29.4s
+ ldr q29, [sp, #208]
add v17.4s, v17.4s, v31.4s
- orr v2.16b, v2.16b, v15.16b
- eor v0.16b, v5.16b, v0.16b
- add v17.4s, v17.4s, v10.4s
- add v4.4s, v4.4s, v20.4s
- add v7.4s, v7.4s, v29.4s
- ushr v15.4s, v0.4s, #12
+ orr v1.16b, v1.16b, v12.16b
+ add v20.4s, v20.4s, v29.4s
+ eor v16.16b, v16.16b, v22.16b
+ add v10.4s, v10.4s, v5.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v25.16b, v25.16b, v10.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ add v21.4s, v21.4s, v16.4s
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v0.16b, v21.16b, v0.16b
+ add v19.4s, v19.4s, v25.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ ushr v12.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v3.16b, v3.16b, v17.16b
- add v4.4s, v4.4s, v9.4s
- add v7.4s, v7.4s, v2.4s
- orr v0.16b, v0.16b, v15.16b
- mov v15.16b, v31.16b
- add v17.4s, v17.4s, v22.4s
- eor v31.16b, v14.16b, v4.16b
- eor v22.16b, v11.16b, v7.16b
- add v11.4s, v13.4s, v27.4s
- tbl v3.16b, { v3.16b }, v19.16b
- add v11.4s, v11.4s, v0.4s
- tbl v31.16b, { v31.16b }, v19.16b
- add v6.4s, v6.4s, v3.4s
- eor v12.16b, v12.16b, v11.16b
- tbl v22.16b, { v22.16b }, v19.16b
- add v8.4s, v8.4s, v31.4s
- eor v10.16b, v6.16b, v10.16b
- add v30.4s, v11.4s, v30.4s
- tbl v11.16b, { v12.16b }, v19.16b
- add v1.4s, v1.4s, v22.4s
- eor v9.16b, v8.16b, v9.16b
- ushr v12.4s, v10.4s, #7
- shl v10.4s, v10.4s, #25
- add v5.4s, v5.4s, v11.4s
- eor v2.16b, v1.16b, v2.16b
- orr v10.16b, v10.16b, v12.16b
- ushr v12.4s, v9.4s, #7
- shl v9.4s, v9.4s, #25
- eor v0.16b, v5.16b, v0.16b
- orr v9.16b, v9.16b, v12.16b
- ushr v12.4s, v2.4s, #7
+ eor v5.16b, v5.16b, v19.16b
+ eor v2.16b, v26.16b, v2.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ ushr v12.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v22.4s, v22.4s, v14.4s
+ mov v8.16b, v31.16b
+ ushr v13.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ mov v31.16b, v14.16b
+ ushr v14.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ orr v5.16b, v5.16b, v12.16b
+ add v22.4s, v22.4s, v0.4s
+ add v10.4s, v10.4s, v28.4s
+ ldr q28, [sp, #352]
+ orr v2.16b, v2.16b, v13.16b
+ orr v1.16b, v1.16b, v14.16b
+ add v17.4s, v17.4s, v30.4s
+ add v20.4s, v20.4s, v3.4s
+ eor v16.16b, v16.16b, v22.16b
+ add v10.4s, v10.4s, v5.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v25.16b, v25.16b, v10.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ add v21.4s, v21.4s, v16.4s
+ tbl v25.16b, { v25.16b }, v18.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v0.16b, v21.16b, v0.16b
+ add v19.4s, v19.4s, v25.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ ushr v12.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ eor v5.16b, v19.16b, v5.16b
+ eor v2.16b, v26.16b, v2.16b
+ eor v1.16b, v7.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ add v10.4s, v10.4s, v23.4s
+ ushr v13.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
+ ushr v14.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v12.16b
+ add v10.4s, v10.4s, v0.4s
+ add v20.4s, v20.4s, v24.4s
+ ldr q24, [sp, #144]
+ orr v2.16b, v2.16b, v13.16b
+ orr v1.16b, v1.16b, v14.16b
+ add v22.4s, v22.4s, v9.4s
+ add v17.4s, v17.4s, v11.4s
+ eor v4.16b, v4.16b, v10.16b
+ add v20.4s, v20.4s, v5.4s
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v27.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v25.16b, v25.16b, v22.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ eor v0.16b, v26.16b, v0.16b
+ add v21.4s, v21.4s, v6.4s
+ add v7.4s, v7.4s, v25.4s
+ add v19.4s, v19.4s, v16.4s
+ ushr v12.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ eor v5.16b, v21.16b, v5.16b
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
+ orr v0.16b, v0.16b, v12.16b
+ add v10.4s, v10.4s, v15.4s
+ ushr v14.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ mov v30.16b, v3.16b
+ ldr q3, [sp, #256]
+ ushr v12.4s, v2.4s, #12
+ shl v2.4s, v2.4s, #20
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ add v10.4s, v10.4s, v0.4s
+ orr v5.16b, v5.16b, v14.16b
+ add v20.4s, v20.4s, v3.4s
orr v2.16b, v2.16b, v12.16b
+ orr v1.16b, v1.16b, v13.16b
+ add v22.4s, v22.4s, v24.4s
+ add v17.4s, v17.4s, v28.4s
+ eor v4.16b, v4.16b, v10.16b
+ add v20.4s, v20.4s, v5.4s
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ tbl v4.16b, { v4.16b }, v18.16b
+ eor v6.16b, v6.16b, v20.16b
+ eor v25.16b, v25.16b, v22.16b
+ eor v16.16b, v16.16b, v17.16b
+ add v26.4s, v26.4s, v4.4s
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ eor v0.16b, v26.16b, v0.16b
+ add v21.4s, v21.4s, v6.4s
+ add v7.4s, v7.4s, v25.4s
+ add v19.4s, v19.4s, v16.4s
ushr v12.4s, v0.4s, #7
shl v0.4s, v0.4s, #25
+ eor v5.16b, v21.16b, v5.16b
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
orr v0.16b, v0.16b, v12.16b
- add v4.4s, v4.4s, v26.4s
- add v17.4s, v17.4s, v0.4s
- add v7.4s, v7.4s, v28.4s
- mov v18.16b, v27.16b
- eor v31.16b, v31.16b, v17.16b
- add v4.4s, v4.4s, v10.4s
- add v27.4s, v30.4s, v2.4s
- eor v22.16b, v22.16b, v4.16b
- add v7.4s, v7.4s, v9.4s
- eor v3.16b, v3.16b, v27.16b
- add v26.4s, v27.4s, v29.4s
- tbl v27.16b, { v31.16b }, v16.16b
- eor v28.16b, v11.16b, v7.16b
- tbl v22.16b, { v22.16b }, v16.16b
- add v1.4s, v1.4s, v27.4s
- add v4.4s, v4.4s, v23.4s
- ldr q23, [sp, #144]
- tbl v28.16b, { v28.16b }, v16.16b
- tbl v3.16b, { v3.16b }, v16.16b
- add v5.4s, v5.4s, v22.4s
- eor v0.16b, v0.16b, v1.16b
- add v6.4s, v6.4s, v28.4s
- add v29.4s, v8.4s, v3.4s
- eor v30.16b, v5.16b, v10.16b
- ushr v8.4s, v0.4s, #12
+ ushr v12.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ mov v23.16b, v9.16b
+ ldr q9, [sp, #112]
+ ushr v13.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ushr v14.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v12.16b
+ add v9.4s, v10.4s, v9.4s
+ orr v2.16b, v2.16b, v13.16b
+ orr v1.16b, v1.16b, v14.16b
+ ldr q14, [sp, #64]
+ add v22.4s, v22.4s, v31.4s
+ add v17.4s, v17.4s, v30.4s
+ add v20.4s, v20.4s, v8.4s
+ add v9.4s, v9.4s, v5.4s
+ add v22.4s, v22.4s, v0.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ eor v25.16b, v25.16b, v9.16b
+ eor v16.16b, v16.16b, v22.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ tbl v6.16b, { v6.16b }, v27.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ add v19.4s, v19.4s, v25.4s
+ add v21.4s, v21.4s, v16.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ eor v5.16b, v5.16b, v19.16b
+ eor v0.16b, v21.16b, v0.16b
+ eor v2.16b, v26.16b, v2.16b
+ eor v1.16b, v7.16b, v1.16b
+ ushr v30.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ushr v10.4s, v0.4s, #12
shl v0.4s, v0.4s, #20
- eor v31.16b, v6.16b, v9.16b
- orr v0.16b, v0.16b, v8.16b
- ushr v8.4s, v30.4s, #12
- shl v30.4s, v30.4s, #20
- eor v2.16b, v29.16b, v2.16b
- orr v30.16b, v30.16b, v8.16b
- ushr v8.4s, v31.4s, #12
- shl v31.4s, v31.4s, #20
- add v17.4s, v17.4s, v25.4s
- add v7.4s, v7.4s, v23.4s
- orr v31.16b, v31.16b, v8.16b
- ushr v8.4s, v2.4s, #12
+ ushr v12.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
- ldur q23, [x29, #-176]
- orr v2.16b, v2.16b, v8.16b
- add v17.4s, v17.4s, v0.4s
- eor v27.16b, v27.16b, v17.16b
- add v4.4s, v4.4s, v30.4s
- add v25.4s, v26.4s, v2.4s
- eor v22.16b, v22.16b, v4.16b
- add v4.4s, v4.4s, v24.4s
- add v7.4s, v7.4s, v31.4s
- eor v3.16b, v3.16b, v25.16b
- add v24.4s, v25.4s, v18.4s
- tbl v25.16b, { v27.16b }, v19.16b
+ ushr v13.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ orr v5.16b, v5.16b, v30.16b
+ add v30.4s, v9.4s, v29.4s
+ add v22.4s, v22.4s, v23.4s
+ ldr q23, [sp, #192]
+ orr v0.16b, v0.16b, v10.16b
+ orr v2.16b, v2.16b, v12.16b
+ orr v1.16b, v1.16b, v13.16b
add v17.4s, v17.4s, v23.4s
- eor v23.16b, v28.16b, v7.16b
- tbl v22.16b, { v22.16b }, v19.16b
- add v1.4s, v1.4s, v25.4s
- tbl v23.16b, { v23.16b }, v19.16b
- tbl v3.16b, { v3.16b }, v19.16b
- add v5.4s, v5.4s, v22.4s
- eor v0.16b, v0.16b, v1.16b
- add v6.4s, v6.4s, v23.4s
- add v26.4s, v29.4s, v3.4s
- eor v27.16b, v5.16b, v30.16b
- ushr v29.4s, v0.4s, #7
- shl v0.4s, v0.4s, #25
- eor v28.16b, v6.16b, v31.16b
- orr v0.16b, v0.16b, v29.16b
- ushr v29.4s, v27.4s, #7
- shl v27.4s, v27.4s, #25
+ add v20.4s, v20.4s, v28.4s
+ add v23.4s, v30.4s, v5.4s
+ add v22.4s, v22.4s, v0.4s
+ add v17.4s, v17.4s, v2.4s
+ add v20.4s, v20.4s, v1.4s
+ eor v25.16b, v25.16b, v23.16b
+ eor v16.16b, v16.16b, v22.16b
+ eor v6.16b, v6.16b, v17.16b
+ eor v4.16b, v4.16b, v20.16b
+ tbl v25.16b, { v25.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ add v19.4s, v19.4s, v25.4s
+ add v21.4s, v21.4s, v16.4s
+ add v26.4s, v26.4s, v6.4s
+ add v7.4s, v7.4s, v4.4s
+ eor v5.16b, v19.16b, v5.16b
+ eor v0.16b, v21.16b, v0.16b
eor v2.16b, v26.16b, v2.16b
- orr v27.16b, v27.16b, v29.16b
- ushr v29.4s, v28.4s, #7
- shl v28.4s, v28.4s, #25
- ldur q18, [x29, #-128]
- orr v28.16b, v28.16b, v29.16b
- ushr v29.4s, v2.4s, #7
+ eor v1.16b, v7.16b, v1.16b
+ ushr v28.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ ushr v30.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v31.4s, v2.4s, #7
shl v2.4s, v2.4s, #25
- add v7.4s, v7.4s, v15.4s
- orr v2.16b, v2.16b, v29.16b
- add v17.4s, v17.4s, v27.4s
- add v4.4s, v4.4s, v28.4s
- add v7.4s, v7.4s, v2.4s
- eor v3.16b, v3.16b, v17.16b
- add v17.4s, v17.4s, v20.4s
- eor v20.16b, v25.16b, v4.16b
- add v4.4s, v4.4s, v21.4s
- eor v21.16b, v22.16b, v7.16b
- add v7.4s, v7.4s, v18.4s
- add v18.4s, v24.4s, v0.4s
- eor v22.16b, v23.16b, v18.16b
- ldr q23, [sp, #160]
- tbl v3.16b, { v3.16b }, v16.16b
- tbl v20.16b, { v20.16b }, v16.16b
- add v6.4s, v6.4s, v3.4s
- add v18.4s, v18.4s, v23.4s
- tbl v21.16b, { v21.16b }, v16.16b
- tbl v16.16b, { v22.16b }, v16.16b
- add v22.4s, v26.4s, v20.4s
- eor v23.16b, v6.16b, v27.16b
- add v1.4s, v1.4s, v21.4s
- eor v24.16b, v22.16b, v28.16b
- ushr v25.4s, v23.4s, #12
- shl v23.4s, v23.4s, #20
- add v5.4s, v5.4s, v16.4s
- eor v2.16b, v1.16b, v2.16b
- orr v23.16b, v23.16b, v25.16b
- ushr v25.4s, v24.4s, #12
- shl v24.4s, v24.4s, #20
- eor v0.16b, v5.16b, v0.16b
- orr v24.16b, v24.16b, v25.16b
- ushr v25.4s, v2.4s, #12
+ ushr v8.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ orr v5.16b, v5.16b, v28.16b
+ ldr q28, [sp, #176]
+ orr v0.16b, v0.16b, v30.16b
+ orr v2.16b, v2.16b, v31.16b
+ orr v1.16b, v1.16b, v8.16b
+ add v23.4s, v23.4s, v28.4s
+ add v22.4s, v22.4s, v11.4s
+ add v17.4s, v17.4s, v15.4s
+ add v20.4s, v20.4s, v3.4s
+ ldr q3, [sp, #272]
+ add v23.4s, v23.4s, v0.4s
+ add v22.4s, v22.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ add v20.4s, v20.4s, v5.4s
+ eor v4.16b, v4.16b, v23.16b
+ eor v25.16b, v25.16b, v22.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v6.16b, v6.16b, v20.16b
+ tbl v4.16b, { v4.16b }, v27.16b
+ tbl v25.16b, { v25.16b }, v27.16b
+ tbl v16.16b, { v16.16b }, v27.16b
+ tbl v6.16b, { v6.16b }, v27.16b
+ add v26.4s, v26.4s, v4.4s
+ add v7.4s, v7.4s, v25.4s
+ add v19.4s, v19.4s, v16.4s
+ add v21.4s, v21.4s, v6.4s
+ eor v0.16b, v26.16b, v0.16b
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
+ eor v5.16b, v21.16b, v5.16b
+ add v3.4s, v22.4s, v3.4s
+ ldr q22, [sp, #160]
+ ushr v28.4s, v0.4s, #12
+ shl v0.4s, v0.4s, #20
+ ushr v29.4s, v2.4s, #12
shl v2.4s, v2.4s, #20
+ ushr v30.4s, v1.4s, #12
+ shl v1.4s, v1.4s, #20
+ ushr v31.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ add v17.4s, v17.4s, v22.4s
+ ldr q22, [sp, #240]
+ orr v0.16b, v0.16b, v28.16b
+ prfm pldl1keep, [x23, #256]
+ orr v2.16b, v2.16b, v29.16b
+ prfm pldl1keep, [x24, #256]
+ orr v1.16b, v1.16b, v30.16b
+ prfm pldl1keep, [x22, #256]
+ orr v5.16b, v5.16b, v31.16b
+ prfm pldl1keep, [x25, #256]
+ add v23.4s, v23.4s, v24.4s
+ add v20.4s, v20.4s, v22.4s
+ add v3.4s, v3.4s, v2.4s
+ add v17.4s, v17.4s, v1.4s
+ add v22.4s, v23.4s, v0.4s
+ add v20.4s, v20.4s, v5.4s
+ eor v23.16b, v25.16b, v3.16b
+ eor v16.16b, v16.16b, v17.16b
+ eor v4.16b, v4.16b, v22.16b
+ eor v6.16b, v6.16b, v20.16b
+ tbl v23.16b, { v23.16b }, v18.16b
+ tbl v16.16b, { v16.16b }, v18.16b
+ tbl v4.16b, { v4.16b }, v18.16b
+ tbl v6.16b, { v6.16b }, v18.16b
+ add v7.4s, v7.4s, v23.4s
+ add v19.4s, v19.4s, v16.4s
+ add v18.4s, v26.4s, v4.4s
+ add v21.4s, v21.4s, v6.4s
+ eor v2.16b, v7.16b, v2.16b
+ eor v1.16b, v19.16b, v1.16b
+ eor v0.16b, v18.16b, v0.16b
+ eor v5.16b, v21.16b, v5.16b
+ ushr v25.4s, v2.4s, #7
+ shl v2.4s, v2.4s, #25
+ ushr v24.4s, v0.4s, #7
+ shl v0.4s, v0.4s, #25
+ ushr v26.4s, v1.4s, #7
+ shl v1.4s, v1.4s, #25
+ ushr v27.4s, v5.4s, #7
+ shl v5.4s, v5.4s, #25
+ orr v0.16b, v0.16b, v24.16b
orr v2.16b, v2.16b, v25.16b
- ushr v25.4s, v0.4s, #12
- shl v0.4s, v0.4s, #20
- orr v0.16b, v0.16b, v25.16b
- add v25.4s, v7.4s, v2.4s
- add v26.4s, v18.4s, v0.4s
- eor v18.16b, v21.16b, v25.16b
- add v17.4s, v17.4s, v23.4s
- add v4.4s, v4.4s, v24.4s
- eor v16.16b, v16.16b, v26.16b
- tbl v21.16b, { v18.16b }, v19.16b
- eor v3.16b, v3.16b, v17.16b
- eor v7.16b, v20.16b, v4.16b
- tbl v16.16b, { v16.16b }, v19.16b
- add v1.4s, v1.4s, v21.4s
- tbl v3.16b, { v3.16b }, v19.16b
- tbl v20.16b, { v7.16b }, v19.16b
- eor v2.16b, v1.16b, v2.16b
- eor v7.16b, v1.16b, v17.16b
- add v1.4s, v5.4s, v16.4s
- eor v0.16b, v1.16b, v0.16b
- eor v18.16b, v1.16b, v4.16b
- add v1.4s, v6.4s, v3.4s
- eor v4.16b, v1.16b, v23.16b
- eor v6.16b, v25.16b, v1.16b
- add v1.4s, v22.4s, v20.4s
- eor v5.16b, v1.16b, v24.16b
- eor v17.16b, v26.16b, v1.16b
- ushr v1.4s, v4.4s, #7
+ orr v1.16b, v1.16b, v26.16b
+ orr v5.16b, v5.16b, v27.16b
+ movi v13.4s, #64
+ eor v29.16b, v19.16b, v22.16b
+ eor v8.16b, v21.16b, v3.16b
+ eor v30.16b, v17.16b, v18.16b
+ eor v31.16b, v20.16b, v7.16b
+ eor v24.16b, v5.16b, v23.16b
+ eor v18.16b, v0.16b, v16.16b
+ eor v25.16b, v2.16b, v6.16b
+ eor v26.16b, v1.16b, v4.16b
+ cbnz x21, .LBB3_5
+ b .LBB3_2
+.LBB3_6:
+ cbz x1, .LBB3_14
+ adrp x12, .LCPI3_3
+ ldr q0, [x11, :lo12:.LCPI3_1]
+ orr w11, w7, w6
+ ldr q2, [x10, :lo12:.LCPI3_2]
+ ldr q1, [x12, :lo12:.LCPI3_3]
+ and x12, x5, #0x1
+.LBB3_8:
+ movi v3.4s, #64
+ lsr x13, x4, #32
+ ldp q5, q4, [x3]
+ mov x15, x2
+ mov w14, w11
+ mov v3.s[0], w4
+ ldr x10, [x0]
+ mov v3.s[1], w13
+ b .LBB3_11
+.LBB3_9:
+ orr w14, w14, w9
+.LBB3_10:
+ ldp q6, q7, [x10]
+ mov v16.16b, v3.16b
+ and w14, w14, #0xff
+ add v5.4s, v5.4s, v4.4s
+ mov x15, x13
+ mov v16.s[3], w14
+ add x14, x10, #32
+ uzp1 v17.4s, v6.4s, v7.4s
+ add x10, x10, #64
+ add v5.4s, v5.4s, v17.4s
+ eor v16.16b, v5.16b, v16.16b
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v18.4s, v16.4s, v1.4s
+ eor v19.16b, v18.16b, v4.16b
+ uzp2 v4.4s, v6.4s, v7.4s
+ ushr v6.4s, v19.4s, #12
+ shl v7.4s, v19.4s, #20
+ ld2 { v19.4s, v20.4s }, [x14]
+ add v5.4s, v5.4s, v4.4s
+ mov w14, w6
+ orr v6.16b, v7.16b, v6.16b
+ add v5.4s, v5.4s, v6.4s
+ eor v7.16b, v16.16b, v5.16b
+ add v5.4s, v5.4s, v19.4s
+ tbl v7.16b, { v7.16b }, v2.16b
+ ext v5.16b, v5.16b, v5.16b, #12
+ add v16.4s, v18.4s, v7.4s
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v6.16b, v6.16b, v16.16b
+ ext v16.16b, v16.16b, v16.16b, #4
+ ushr v18.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ orr v6.16b, v6.16b, v18.16b
+ ext v18.16b, v20.16b, v20.16b, #12
+ add v5.4s, v5.4s, v6.4s
+ eor v7.16b, v5.16b, v7.16b
+ add v5.4s, v5.4s, v18.4s
+ tbl v7.16b, { v7.16b }, v0.16b
+ add v16.4s, v16.4s, v7.4s
+ eor v6.16b, v6.16b, v16.16b
+ ushr v21.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ orr v6.16b, v6.16b, v21.16b
+ uzp1 v21.4s, v17.4s, v17.4s
+ add v5.4s, v5.4s, v6.4s
+ ext v21.16b, v21.16b, v17.16b, #8
+ eor v7.16b, v7.16b, v5.16b
+ uzp2 v21.4s, v21.4s, v4.4s
+ tbl v7.16b, { v7.16b }, v2.16b
+ add v5.4s, v5.4s, v21.4s
+ add v16.4s, v16.4s, v7.4s
+ ext v5.16b, v5.16b, v5.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #8
+ eor v6.16b, v6.16b, v16.16b
+ ushr v22.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ orr v6.16b, v6.16b, v22.16b
+ add v22.4s, v5.4s, v6.4s
+ eor v5.16b, v22.16b, v7.16b
+ ext v7.16b, v16.16b, v16.16b, #12
+ tbl v16.16b, { v5.16b }, v0.16b
+ ext v5.16b, v17.16b, v17.16b, #12
+ add v7.4s, v7.4s, v16.4s
+ ext v5.16b, v17.16b, v5.16b, #12
+ ext v17.16b, v19.16b, v19.16b, #12
+ mov v19.16b, v18.16b
+ eor v6.16b, v6.16b, v7.16b
+ rev64 v5.4s, v5.4s
+ mov v19.s[1], v17.s[2]
+ ushr v20.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ trn2 v5.4s, v5.4s, v19.4s
+ orr v6.16b, v6.16b, v20.16b
+ zip1 v20.2d, v18.2d, v4.2d
+ zip2 v4.4s, v4.4s, v18.4s
+ add v19.4s, v6.4s, v5.4s
+ mov v20.s[3], v17.s[3]
+ add v19.4s, v19.4s, v22.4s
+ ext v22.16b, v20.16b, v20.16b, #12
+ eor v16.16b, v16.16b, v19.16b
+ ext v19.16b, v19.16b, v19.16b, #12
+ tbl v16.16b, { v16.16b }, v2.16b
+ add v7.4s, v7.4s, v16.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ eor v6.16b, v6.16b, v7.16b
+ ext v7.16b, v7.16b, v7.16b, #4
+ ushr v23.4s, v6.4s, #7
+ shl v24.4s, v6.4s, #25
+ uzp1 v6.4s, v20.4s, v22.4s
+ orr v20.16b, v24.16b, v23.16b
+ add v22.4s, v20.4s, v6.4s
+ add v19.4s, v22.4s, v19.4s
+ eor v16.16b, v19.16b, v16.16b
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v7.4s, v7.4s, v16.4s
+ eor v18.16b, v20.16b, v7.16b
+ zip1 v20.4s, v4.4s, v17.4s
+ zip1 v4.4s, v17.4s, v4.4s
+ ushr v17.4s, v18.4s, #12
+ shl v18.4s, v18.4s, #20
+ ext v20.16b, v4.16b, v20.16b, #8
+ orr v4.16b, v18.16b, v17.16b
+ ext v18.16b, v21.16b, v21.16b, #4
+ add v17.4s, v4.4s, v20.4s
+ add v17.4s, v17.4s, v19.4s
+ uzp1 v19.4s, v18.4s, v18.4s
+ eor v16.16b, v16.16b, v17.16b
+ ext v19.16b, v19.16b, v18.16b, #8
+ tbl v16.16b, { v16.16b }, v2.16b
+ uzp2 v19.4s, v19.4s, v5.4s
+ add v7.4s, v7.4s, v16.4s
+ add v17.4s, v17.4s, v19.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ eor v4.16b, v4.16b, v7.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ ext v7.16b, v7.16b, v7.16b, #12
+ ushr v21.4s, v4.4s, #7
shl v4.4s, v4.4s, #25
- orr v1.16b, v4.16b, v1.16b
- ushr v4.4s, v5.4s, #7
+ orr v4.16b, v4.16b, v21.16b
+ ext v21.16b, v18.16b, v18.16b, #12
+ add v17.4s, v17.4s, v4.4s
+ ext v18.16b, v18.16b, v21.16b, #12
+ mov v21.16b, v20.16b
+ eor v16.16b, v17.16b, v16.16b
+ rev64 v18.4s, v18.4s
+ mov v21.s[1], v6.s[2]
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v7.4s, v7.4s, v16.4s
+ eor v4.16b, v4.16b, v7.16b
+ ushr v22.4s, v4.4s, #12
+ shl v23.4s, v4.4s, #20
+ trn2 v4.4s, v18.4s, v21.4s
+ orr v18.16b, v23.16b, v22.16b
+ add v21.4s, v18.4s, v4.4s
+ add v17.4s, v21.4s, v17.4s
+ zip1 v21.2d, v20.2d, v5.2d
+ zip2 v5.4s, v5.4s, v20.4s
+ eor v16.16b, v16.16b, v17.16b
+ mov v21.s[3], v6.s[3]
+ ext v17.16b, v17.16b, v17.16b, #12
+ zip1 v20.4s, v5.4s, v6.4s
+ tbl v16.16b, { v16.16b }, v2.16b
+ zip1 v5.4s, v6.4s, v5.4s
+ add v22.4s, v7.4s, v16.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ ext v20.16b, v5.16b, v20.16b, #8
+ eor v7.16b, v18.16b, v22.16b
+ ext v18.16b, v21.16b, v21.16b, #12
+ ushr v23.4s, v7.4s, #7
+ shl v24.4s, v7.4s, #25
+ uzp1 v7.4s, v21.4s, v18.4s
+ orr v18.16b, v24.16b, v23.16b
+ add v21.4s, v18.4s, v7.4s
+ add v17.4s, v21.4s, v17.4s
+ ext v21.16b, v22.16b, v22.16b, #4
+ eor v16.16b, v17.16b, v16.16b
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v21.4s, v21.4s, v16.4s
+ eor v18.16b, v18.16b, v21.16b
+ ushr v6.4s, v18.4s, #12
+ shl v18.4s, v18.4s, #20
+ orr v5.16b, v18.16b, v6.16b
+ add v6.4s, v5.4s, v20.4s
+ add v6.4s, v6.4s, v17.4s
+ ext v17.16b, v19.16b, v19.16b, #4
+ eor v16.16b, v16.16b, v6.16b
+ uzp1 v18.4s, v17.4s, v17.4s
+ tbl v16.16b, { v16.16b }, v2.16b
+ ext v18.16b, v18.16b, v17.16b, #8
+ add v19.4s, v21.4s, v16.4s
+ uzp2 v18.4s, v18.4s, v4.4s
+ ext v16.16b, v16.16b, v16.16b, #8
+ eor v5.16b, v5.16b, v19.16b
+ add v6.4s, v6.4s, v18.4s
+ ext v19.16b, v19.16b, v19.16b, #12
+ ushr v21.4s, v5.4s, #7
shl v5.4s, v5.4s, #25
- orr v4.16b, v5.16b, v4.16b
- ushr v5.4s, v2.4s, #7
- shl v2.4s, v2.4s, #25
- orr v2.16b, v2.16b, v5.16b
- ushr v5.4s, v0.4s, #7
- shl v0.4s, v0.4s, #25
- orr v0.16b, v0.16b, v5.16b
- eor v10.16b, v0.16b, v20.16b
- eor v11.16b, v1.16b, v21.16b
- eor v19.16b, v4.16b, v16.16b
- cmp x0, x22
- eor v16.16b, v2.16b, v3.16b
- mov w6, w19
- b.ne .LBB2_4
-.LBB2_7:
- zip1 v0.4s, v7.4s, v18.4s
- zip2 v1.4s, v7.4s, v18.4s
- zip1 v2.4s, v6.4s, v17.4s
- zip2 v3.4s, v6.4s, v17.4s
- zip1 v4.4s, v10.4s, v11.4s
- zip2 v5.4s, v10.4s, v11.4s
- zip1 v6.4s, v19.4s, v16.4s
- zip2 v7.4s, v19.4s, v16.4s
- add x15, x20, #4
- tst w5, #0x1
- sub x28, x28, #4
- zip1 v16.2d, v0.2d, v2.2d
- zip2 v0.2d, v0.2d, v2.2d
- zip1 v2.2d, v1.2d, v3.2d
- zip2 v1.2d, v1.2d, v3.2d
- zip1 v3.2d, v4.2d, v6.2d
- zip2 v4.2d, v4.2d, v6.2d
- zip1 v6.2d, v5.2d, v7.2d
- zip2 v5.2d, v5.2d, v7.2d
- add x24, x24, #32
- csel x20, x15, x20, ne
- cmp x28, #3
- stp q16, q3, [x26]
- stp q0, q4, [x26, #32]
- stp q2, q6, [x26, #64]
- stp q1, q5, [x26, #96]
- add x26, x26, #128
- b.hi .LBB2_2
-.LBB2_8:
- cbz x28, .LBB2_16
- orr w8, w7, w19
- and x21, x5, #0x1
- stur w8, [x29, #-64]
-.LBB2_10:
- ldr x8, [sp, #40]
- ldr x25, [x24]
- ldur w4, [x29, #-64]
- ldp q1, q0, [x8]
- mov x8, x22
- stp q1, q0, [x29, #-48]
-.LBB2_11:
- subs x23, x8, #1
- b.eq .LBB2_13
- cbnz x8, .LBB2_14
- b .LBB2_15
-.LBB2_13:
- orr w4, w4, w27
-.LBB2_14:
- sub x0, x29, #48
- mov w2, #64
- mov x1, x25
- mov x3, x20
- bl zfs_blake3_compress_in_place_sse41
- add x25, x25, #64
- mov x8, x23
- mov w4, w19
- b .LBB2_11
-.LBB2_15:
- ldp q0, q1, [x29, #-48]
- add x20, x20, x21
- add x24, x24, #8
- subs x28, x28, #1
- stp q0, q1, [x26], #32
- b.ne .LBB2_10
-.LBB2_16:
- add sp, sp, #448
- ldp x20, x19, [sp, #144]
- ldp x22, x21, [sp, #128]
- ldp x24, x23, [sp, #112]
- ldp x26, x25, [sp, #96]
- ldp x28, x27, [sp, #80]
- ldp x29, x30, [sp, #64]
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v5.16b, v5.16b, v21.16b
+ ext v21.16b, v17.16b, v17.16b, #12
+ add v6.4s, v6.4s, v5.4s
+ ext v17.16b, v17.16b, v21.16b, #12
+ mov v21.16b, v20.16b
+ eor v16.16b, v6.16b, v16.16b
+ rev64 v17.4s, v17.4s
+ mov v21.s[1], v7.s[2]
+ tbl v16.16b, { v16.16b }, v0.16b
+ add v19.4s, v19.4s, v16.4s
+ eor v5.16b, v5.16b, v19.16b
+ ushr v22.4s, v5.4s, #12
+ shl v23.4s, v5.4s, #20
+ trn2 v5.4s, v17.4s, v21.4s
+ orr v17.16b, v23.16b, v22.16b
+ add v21.4s, v17.4s, v5.4s
+ add v6.4s, v21.4s, v6.4s
+ eor v16.16b, v16.16b, v6.16b
+ ext v6.16b, v6.16b, v6.16b, #12
+ tbl v21.16b, { v16.16b }, v2.16b
+ zip1 v16.2d, v20.2d, v4.2d
+ zip2 v4.4s, v4.4s, v20.4s
+ add v19.4s, v19.4s, v21.4s
+ mov v16.s[3], v7.s[3]
+ ext v21.16b, v21.16b, v21.16b, #8
+ zip1 v20.4s, v4.4s, v7.4s
+ eor v17.16b, v17.16b, v19.16b
+ ext v22.16b, v16.16b, v16.16b, #12
+ ext v19.16b, v19.16b, v19.16b, #4
+ zip1 v4.4s, v7.4s, v4.4s
+ ushr v23.4s, v17.4s, #7
+ shl v17.4s, v17.4s, #25
+ uzp1 v16.4s, v16.4s, v22.4s
+ ext v4.16b, v4.16b, v20.16b, #8
+ orr v17.16b, v17.16b, v23.16b
+ add v22.4s, v17.4s, v16.4s
+ add v6.4s, v22.4s, v6.4s
+ eor v21.16b, v6.16b, v21.16b
+ tbl v21.16b, { v21.16b }, v0.16b
+ add v19.4s, v19.4s, v21.4s
+ eor v17.16b, v17.16b, v19.16b
+ ushr v7.4s, v17.4s, #12
+ shl v17.4s, v17.4s, #20
+ orr v7.16b, v17.16b, v7.16b
+ add v17.4s, v7.4s, v4.4s
+ add v6.4s, v17.4s, v6.4s
+ ext v17.16b, v18.16b, v18.16b, #4
+ eor v18.16b, v21.16b, v6.16b
+ uzp1 v20.4s, v17.4s, v17.4s
+ tbl v18.16b, { v18.16b }, v2.16b
+ ext v20.16b, v20.16b, v17.16b, #8
+ add v19.4s, v19.4s, v18.4s
+ uzp2 v20.4s, v20.4s, v5.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v7.16b, v7.16b, v19.16b
+ add v6.4s, v6.4s, v20.4s
+ ushr v21.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ ext v6.16b, v6.16b, v6.16b, #4
+ orr v7.16b, v7.16b, v21.16b
+ add v21.4s, v6.4s, v7.4s
+ eor v6.16b, v21.16b, v18.16b
+ ext v18.16b, v19.16b, v19.16b, #12
+ tbl v19.16b, { v6.16b }, v0.16b
+ ext v6.16b, v17.16b, v17.16b, #12
+ add v18.4s, v18.4s, v19.4s
+ ext v6.16b, v17.16b, v6.16b, #12
+ mov v17.16b, v4.16b
+ eor v7.16b, v7.16b, v18.16b
+ rev64 v6.4s, v6.4s
+ mov v17.s[1], v16.s[2]
+ ushr v22.4s, v7.4s, #12
+ shl v7.4s, v7.4s, #20
+ trn2 v6.4s, v6.4s, v17.4s
+ orr v7.16b, v7.16b, v22.16b
+ add v17.4s, v7.4s, v6.4s
+ add v17.4s, v17.4s, v21.4s
+ zip1 v21.2d, v4.2d, v5.2d
+ zip2 v4.4s, v5.4s, v4.4s
+ eor v19.16b, v19.16b, v17.16b
+ mov v21.s[3], v16.s[3]
+ ext v17.16b, v17.16b, v17.16b, #12
+ tbl v19.16b, { v19.16b }, v2.16b
+ ext v22.16b, v21.16b, v21.16b, #12
+ add v18.4s, v18.4s, v19.4s
+ ext v19.16b, v19.16b, v19.16b, #8
+ eor v7.16b, v7.16b, v18.16b
+ ext v18.16b, v18.16b, v18.16b, #4
+ ushr v23.4s, v7.4s, #7
+ shl v24.4s, v7.4s, #25
+ uzp1 v7.4s, v21.4s, v22.4s
+ orr v21.16b, v24.16b, v23.16b
+ add v22.4s, v21.4s, v7.4s
+ add v17.4s, v22.4s, v17.4s
+ eor v19.16b, v17.16b, v19.16b
+ tbl v19.16b, { v19.16b }, v0.16b
+ add v18.4s, v18.4s, v19.4s
+ eor v5.16b, v21.16b, v18.16b
+ zip1 v21.4s, v4.4s, v16.4s
+ zip1 v4.4s, v16.4s, v4.4s
+ ushr v16.4s, v5.4s, #12
+ shl v5.4s, v5.4s, #20
+ ext v21.16b, v4.16b, v21.16b, #8
+ orr v4.16b, v5.16b, v16.16b
+ ext v16.16b, v20.16b, v20.16b, #4
+ mov v23.16b, v21.16b
+ add v5.4s, v4.4s, v21.4s
+ mov v23.s[1], v7.s[2]
+ add v5.4s, v5.4s, v17.4s
+ eor v17.16b, v19.16b, v5.16b
+ uzp1 v19.4s, v16.4s, v16.4s
+ tbl v17.16b, { v17.16b }, v2.16b
+ ext v19.16b, v19.16b, v16.16b, #8
+ add v18.4s, v18.4s, v17.4s
+ uzp2 v19.4s, v19.4s, v6.4s
+ eor v4.16b, v4.16b, v18.16b
+ add v5.4s, v5.4s, v19.4s
+ ext v19.16b, v19.16b, v19.16b, #4
+ ushr v20.4s, v4.4s, #7
+ shl v4.4s, v4.4s, #25
+ ext v5.16b, v5.16b, v5.16b, #4
+ orr v20.16b, v4.16b, v20.16b
+ ext v4.16b, v17.16b, v17.16b, #8
+ add v17.4s, v5.4s, v20.4s
+ ext v5.16b, v18.16b, v18.16b, #12
+ eor v4.16b, v17.16b, v4.16b
+ tbl v18.16b, { v4.16b }, v0.16b
+ ext v4.16b, v16.16b, v16.16b, #12
+ add v22.4s, v5.4s, v18.4s
+ ext v4.16b, v16.16b, v4.16b, #12
+ eor v5.16b, v20.16b, v22.16b
+ rev64 v16.4s, v4.4s
+ ushr v20.4s, v5.4s, #12
+ shl v24.4s, v5.4s, #20
+ trn2 v5.4s, v16.4s, v23.4s
+ orr v16.16b, v24.16b, v20.16b
+ add v20.4s, v16.4s, v5.4s
+ add v17.4s, v20.4s, v17.4s
+ zip1 v20.2d, v21.2d, v6.2d
+ zip2 v6.4s, v6.4s, v21.4s
+ eor v18.16b, v18.16b, v17.16b
+ mov v20.s[3], v7.s[3]
+ ext v17.16b, v17.16b, v17.16b, #12
+ zip1 v21.4s, v6.4s, v7.4s
+ tbl v18.16b, { v18.16b }, v2.16b
+ ext v24.16b, v20.16b, v20.16b, #12
+ zip1 v6.4s, v7.4s, v6.4s
+ add v22.4s, v22.4s, v18.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ ext v6.16b, v6.16b, v21.16b, #8
+ eor v16.16b, v16.16b, v22.16b
+ ext v22.16b, v22.16b, v22.16b, #4
+ zip1 v5.2d, v6.2d, v5.2d
+ zip2 v4.4s, v4.4s, v6.4s
+ ushr v25.4s, v16.4s, #7
+ shl v26.4s, v16.4s, #25
+ uzp1 v16.4s, v20.4s, v24.4s
+ orr v20.16b, v26.16b, v25.16b
+ mov v5.s[3], v16.s[3]
+ add v24.4s, v20.4s, v16.4s
+ add v17.4s, v24.4s, v17.4s
+ eor v18.16b, v17.16b, v18.16b
+ tbl v18.16b, { v18.16b }, v0.16b
+ add v22.4s, v22.4s, v18.4s
+ eor v20.16b, v20.16b, v22.16b
+ ushr v7.4s, v20.4s, #12
+ shl v20.4s, v20.4s, #20
+ orr v7.16b, v20.16b, v7.16b
+ add v20.4s, v7.4s, v6.4s
+ add v17.4s, v20.4s, v17.4s
+ ext v20.16b, v19.16b, v19.16b, #8
+ eor v18.16b, v18.16b, v17.16b
+ ext v17.16b, v17.16b, v17.16b, #4
+ tbl v18.16b, { v18.16b }, v2.16b
+ add v21.4s, v22.4s, v18.4s
+ uzp2 v22.4s, v20.4s, v23.4s
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v7.16b, v7.16b, v21.16b
+ ext v20.16b, v22.16b, v20.16b, #4
+ ushr v22.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ add v17.4s, v17.4s, v20.4s
+ ext v20.16b, v21.16b, v21.16b, #12
+ ext v21.16b, v19.16b, v19.16b, #12
+ orr v7.16b, v7.16b, v22.16b
+ ext v19.16b, v19.16b, v21.16b, #12
+ add v17.4s, v17.4s, v7.4s
+ mov v21.16b, v6.16b
+ rev64 v19.4s, v19.4s
+ eor v18.16b, v17.16b, v18.16b
+ mov v21.s[1], v16.s[2]
+ tbl v18.16b, { v18.16b }, v0.16b
+ trn2 v19.4s, v19.4s, v21.4s
+ add v20.4s, v20.4s, v18.4s
+ eor v7.16b, v7.16b, v20.16b
+ ushr v22.4s, v7.4s, #12
+ shl v7.4s, v7.4s, #20
+ orr v7.16b, v7.16b, v22.16b
+ add v19.4s, v7.4s, v19.4s
+ add v17.4s, v19.4s, v17.4s
+ eor v18.16b, v18.16b, v17.16b
+ ext v17.16b, v17.16b, v17.16b, #12
+ tbl v18.16b, { v18.16b }, v2.16b
+ add v19.4s, v20.4s, v18.4s
+ ext v20.16b, v5.16b, v5.16b, #12
+ ext v18.16b, v18.16b, v18.16b, #8
+ eor v7.16b, v7.16b, v19.16b
+ uzp1 v5.4s, v5.4s, v20.4s
+ ushr v21.4s, v7.4s, #7
+ shl v7.4s, v7.4s, #25
+ orr v7.16b, v7.16b, v21.16b
+ add v5.4s, v7.4s, v5.4s
+ add v5.4s, v5.4s, v17.4s
+ eor v17.16b, v5.16b, v18.16b
+ ext v18.16b, v19.16b, v19.16b, #4
+ tbl v17.16b, { v17.16b }, v0.16b
+ add v18.4s, v18.4s, v17.4s
+ eor v6.16b, v7.16b, v18.16b
+ zip1 v7.4s, v4.4s, v16.4s
+ zip1 v4.4s, v16.4s, v4.4s
+ ushr v16.4s, v6.4s, #12
+ shl v6.4s, v6.4s, #20
+ ext v4.16b, v4.16b, v7.16b, #8
+ orr v6.16b, v6.16b, v16.16b
+ add v4.4s, v6.4s, v4.4s
+ add v4.4s, v4.4s, v5.4s
+ eor v5.16b, v17.16b, v4.16b
+ ext v4.16b, v4.16b, v4.16b, #4
+ tbl v5.16b, { v5.16b }, v2.16b
+ add v7.4s, v18.4s, v5.4s
+ eor v6.16b, v6.16b, v7.16b
+ ext v7.16b, v7.16b, v7.16b, #12
+ ushr v16.4s, v6.4s, #7
+ shl v6.4s, v6.4s, #25
+ orr v6.16b, v6.16b, v16.16b
+ ext v16.16b, v5.16b, v5.16b, #8
+ eor v5.16b, v4.16b, v7.16b
+ eor v4.16b, v6.16b, v16.16b
+.LBB3_11:
+ subs x13, x15, #1
+ b.eq .LBB3_9
+ cbnz x15, .LBB3_10
+ add x4, x4, x12
+ add x0, x0, #8
+ subs x1, x1, #1
+ stp q5, q4, [x8], #32
+ b.ne .LBB3_8
+.LBB3_14:
+ add sp, sp, #368
+ ldp x20, x19, [sp, #128]
+ ldp x22, x21, [sp, #112]
+ ldp x24, x23, [sp, #96]
+ ldp x26, x25, [sp, #80]
+ ldp x29, x27, [sp, #64]
ldp d9, d8, [sp, #48]
ldp d11, d10, [sp, #32]
ldp d13, d12, [sp, #16]
- ldp d15, d14, [sp], #160
+ ldp d15, d14, [sp], #144
ret
-.Lfunc_end2:
- .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41
+.Lfunc_end3:
+ .size zfs_blake3_hash_many_sse41, .Lfunc_end3-zfs_blake3_hash_many_sse41
.cfi_endproc
.section ".note.GNU-stack","",@progbits
-#endif
+#endif \ No newline at end of file