1993 lines
		
	
	
		
			40 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			1993 lines
		
	
	
		
			40 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| // This file is generated from a similarly-named Perl script in the BoringSSL
 | |
| // source tree. Do not edit by hand.
 | |
| 
 | |
| #if !defined(__has_feature)
 | |
| #define __has_feature(x) 0
 | |
| #endif
 | |
| #if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
 | |
| #define OPENSSL_NO_ASM
 | |
| #endif
 | |
| 
 | |
| #if !defined(OPENSSL_NO_ASM)
 | |
| #if defined(BORINGSSL_PREFIX)
 | |
| #include <boringssl_prefix_symbols_asm.h>
 | |
| #endif
 | |
| #include <openssl/arm_arch.h>
 | |
| 
 | |
| 
 | |
| .private_extern	_OPENSSL_armcap_P
 | |
| 
 | |
| .section	__TEXT,__const
 | |
| // Read-only constants referenced by the ChaCha20 routines in this file.
 | |
| .align	5
 | |
| Lsigma:
 | |
| .quad	0x3320646e61707865,0x6b20657479622d32		// the 16 bytes spell ASCII "expand 32-byte k" (least-significant byte first); endian-neutral
 | |
| Lone:
 | |
| .long	1,0,0,0		// vector {1,0,0,0}; sits 16 bytes after Lsigma, loaded into v31 by the NEON paths and added to the counter row
 | |
| .byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0	// ASCII "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated
 | |
| .align	2
 | |
| 
 | |
| .text
 | |
| 
 | |
| .globl	_ChaCha20_ctr32
 | |
| .private_extern	_ChaCha20_ctr32
 | |
| // _ChaCha20_ctr32: XOR x2 bytes read from [x1] with ChaCha20 keystream and write them to [x0]; x3 points at the 32-byte key, x4 at the 16-byte counter block. NOTE(review): presumably the C-callable ChaCha20_ctr32(out, in, in_len, key, counter) -- confirm against the C prototype.
 | |
| .align	5
 | |
| _ChaCha20_ctr32:
 | |
| 	AARCH64_VALID_CALL_TARGET
 | |
| 	cbz	x2,Labort		// zero-length request: Labort is a bare ret, nothing has been pushed yet
 | |
| #if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
 | |
| 	adrp	x5,:pg_hi21_nc:_OPENSSL_armcap_P
 | |
| #else
 | |
| 	adrp	x5,_OPENSSL_armcap_P@PAGE
 | |
| #endif
 | |
| 	cmp	x2,#192
 | |
| 	b.lo	Lshort		// fewer than 192 bytes: take the scalar path without reading the capability word
 | |
| 	ldr	w17,[x5,_OPENSSL_armcap_P@PAGEOFF]
 | |
| 	tst	w17,#ARMV7_NEON
 | |
| 	b.ne	ChaCha20_neon		// NEON capability bit set: hand over to the NEON routine (no frame set up yet)
 | |
| // Scalar path: reached for short inputs, or by fall-through when the NEON bit is clear.
 | |
| Lshort:
 | |
| 	AARCH64_SIGN_LINK_REGISTER		// macro from <openssl/arm_arch.h>; NOTE(review): presumably signs x30, paired with AARCH64_VALIDATE_LINK_REGISTER in the epilogue -- confirm
 | |
| 	stp	x29,x30,[sp,#-96]!		// 96-byte frame: x29/x30 here, x19-x28 stored below
 | |
| 	add	x29,sp,#0		// x29 = frame base; the epilogues reload the saved registers through it
 | |
| 
 | |
| 	adrp	x5,Lsigma@PAGE
 | |
| 	add	x5,x5,Lsigma@PAGEOFF		// x5 = address of Lsigma
 | |
| 	stp	x19,x20,[sp,#16]
 | |
| 	stp	x21,x22,[sp,#32]
 | |
| 	stp	x23,x24,[sp,#48]
 | |
| 	stp	x25,x26,[sp,#64]
 | |
| 	stp	x27,x28,[sp,#80]
 | |
| 	sub	sp,sp,#64		// 64-byte scratch area at [sp]; Ltail stores a keystream block there
 | |
| // Keep the 16-word input state packed two 32-bit words per register in x22-x28 and x30 for the whole call.
 | |
| 	ldp	x22,x23,[x5]		// load sigma
 | |
| 	ldp	x24,x25,[x3]		// load key
 | |
| 	ldp	x26,x27,[x3,#16]
 | |
| 	ldp	x28,x30,[x4]		// load counter (x30 is reused as data; the return address was saved above)
 | |
| #ifdef	__AARCH64EB__
 | |
| 	ror	x24,x24,#32		// big-endian builds: swap the two 32-bit halves of each packed key/counter pair
 | |
| 	ror	x25,x25,#32
 | |
| 	ror	x26,x26,#32
 | |
| 	ror	x27,x27,#32
 | |
| 	ror	x28,x28,#32
 | |
| 	ror	x30,x30,#32
 | |
| #endif
 | |
| 
 | |
| Loop_outer:		// once per 64-byte block: expand the packed state into 16 working words w5-w17,w19-w21 (x18 is not used)
 | |
| 	mov	w5,w22			// unpack key block: low half of each packed register ...
 | |
| 	lsr	x6,x22,#32		// ... and high half; w5-w8 = sigma words
 | |
| 	mov	w7,w23
 | |
| 	lsr	x8,x23,#32
 | |
| 	mov	w9,w24		// w9-w12 = first 16 key bytes
 | |
| 	lsr	x10,x24,#32
 | |
| 	mov	w11,w25
 | |
| 	lsr	x12,x25,#32
 | |
| 	mov	w13,w26		// w13-w16 = second 16 key bytes
 | |
| 	lsr	x14,x26,#32
 | |
| 	mov	w15,w27
 | |
| 	lsr	x16,x27,#32
 | |
| 	mov	w17,w28		// w17,w19,w20,w21 = the four counter-block words
 | |
| 	lsr	x19,x28,#32
 | |
| 	mov	w20,w30
 | |
| 	lsr	x21,x30,#32
 | |
| 
 | |
| 	mov	x4,#10		// x4 is reused as the iteration count: 10 passes of Loop, two rounds each
 | |
| 	subs	x2,x2,#64		// flags set here are consumed after Loop by b.lo Ltail and b.hi Loop_outer; nothing in between writes them
 | |
| Loop:		// one pass = a column round followed by a diagonal round, four quarter-rounds interleaved instruction by instruction
 | |
| 	sub	x4,x4,#1
 | |
| 	add	w5,w5,w9		// column round on (a,b,c,d) = (w5,w9,w13,w17),(w6,w10,w14,w19),(w7,w11,w15,w20),(w8,w12,w16,w21): a += b
 | |
| 	add	w6,w6,w10
 | |
| 	add	w7,w7,w11
 | |
| 	add	w8,w8,w12
 | |
| 	eor	w17,w17,w5		// d ^= a
 | |
| 	eor	w19,w19,w6
 | |
| 	eor	w20,w20,w7
 | |
| 	eor	w21,w21,w8
 | |
| 	ror	w17,w17,#16		// d = rotate-left(d,16)
 | |
| 	ror	w19,w19,#16
 | |
| 	ror	w20,w20,#16
 | |
| 	ror	w21,w21,#16
 | |
| 	add	w13,w13,w17		// c += d
 | |
| 	add	w14,w14,w19
 | |
| 	add	w15,w15,w20
 | |
| 	add	w16,w16,w21
 | |
| 	eor	w9,w9,w13		// b ^= c
 | |
| 	eor	w10,w10,w14
 | |
| 	eor	w11,w11,w15
 | |
| 	eor	w12,w12,w16
 | |
| 	ror	w9,w9,#20		// b = rotate-left(b,12), written as a right rotate by 20
 | |
| 	ror	w10,w10,#20
 | |
| 	ror	w11,w11,#20
 | |
| 	ror	w12,w12,#20
 | |
| 	add	w5,w5,w9		// a += b
 | |
| 	add	w6,w6,w10
 | |
| 	add	w7,w7,w11
 | |
| 	add	w8,w8,w12
 | |
| 	eor	w17,w17,w5		// d ^= a
 | |
| 	eor	w19,w19,w6
 | |
| 	eor	w20,w20,w7
 | |
| 	eor	w21,w21,w8
 | |
| 	ror	w17,w17,#24		// d = rotate-left(d,8)
 | |
| 	ror	w19,w19,#24
 | |
| 	ror	w20,w20,#24
 | |
| 	ror	w21,w21,#24
 | |
| 	add	w13,w13,w17		// c += d
 | |
| 	add	w14,w14,w19
 | |
| 	add	w15,w15,w20
 | |
| 	add	w16,w16,w21
 | |
| 	eor	w9,w9,w13		// b ^= c
 | |
| 	eor	w10,w10,w14
 | |
| 	eor	w11,w11,w15
 | |
| 	eor	w12,w12,w16
 | |
| 	ror	w9,w9,#25		// b = rotate-left(b,7)
 | |
| 	ror	w10,w10,#25
 | |
| 	ror	w11,w11,#25
 | |
| 	ror	w12,w12,#25
 | |
| 	add	w5,w5,w10		// diagonal round on (w5,w10,w15,w21),(w6,w11,w16,w17),(w7,w12,w13,w19),(w8,w9,w14,w20): same steps, registers permuted
 | |
| 	add	w6,w6,w11
 | |
| 	add	w7,w7,w12
 | |
| 	add	w8,w8,w9
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	w17,w17,w6
 | |
| 	eor	w19,w19,w7
 | |
| 	eor	w20,w20,w8
 | |
| 	ror	w21,w21,#16
 | |
| 	ror	w17,w17,#16
 | |
| 	ror	w19,w19,#16
 | |
| 	ror	w20,w20,#16
 | |
| 	add	w15,w15,w21
 | |
| 	add	w16,w16,w17
 | |
| 	add	w13,w13,w19
 | |
| 	add	w14,w14,w20
 | |
| 	eor	w10,w10,w15
 | |
| 	eor	w11,w11,w16
 | |
| 	eor	w12,w12,w13
 | |
| 	eor	w9,w9,w14
 | |
| 	ror	w10,w10,#20
 | |
| 	ror	w11,w11,#20
 | |
| 	ror	w12,w12,#20
 | |
| 	ror	w9,w9,#20
 | |
| 	add	w5,w5,w10
 | |
| 	add	w6,w6,w11
 | |
| 	add	w7,w7,w12
 | |
| 	add	w8,w8,w9
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	w17,w17,w6
 | |
| 	eor	w19,w19,w7
 | |
| 	eor	w20,w20,w8
 | |
| 	ror	w21,w21,#24
 | |
| 	ror	w17,w17,#24
 | |
| 	ror	w19,w19,#24
 | |
| 	ror	w20,w20,#24
 | |
| 	add	w15,w15,w21
 | |
| 	add	w16,w16,w17
 | |
| 	add	w13,w13,w19
 | |
| 	add	w14,w14,w20
 | |
| 	eor	w10,w10,w15
 | |
| 	eor	w11,w11,w16
 | |
| 	eor	w12,w12,w13
 | |
| 	eor	w9,w9,w14
 | |
| 	ror	w10,w10,#25
 | |
| 	ror	w11,w11,#25
 | |
| 	ror	w12,w12,#25
 | |
| 	ror	w9,w9,#25
 | |
| 	cbnz	x4,Loop		// cbnz leaves the flags from "subs x2,x2,#64" untouched
 | |
| // Add the original input state back into the permuted words (feed-forward).
 | |
| 	add	w5,w5,w22		// accumulate key block
 | |
| 	add	x6,x6,x22,lsr#32		// 64-bit add for the high-half words; a carry past bit 31 is dropped later by the lsl#32 in the pack step
 | |
| 	add	w7,w7,w23
 | |
| 	add	x8,x8,x23,lsr#32
 | |
| 	add	w9,w9,w24
 | |
| 	add	x10,x10,x24,lsr#32
 | |
| 	add	w11,w11,w25
 | |
| 	add	x12,x12,x25,lsr#32
 | |
| 	add	w13,w13,w26
 | |
| 	add	x14,x14,x26,lsr#32
 | |
| 	add	w15,w15,w27
 | |
| 	add	x16,x16,x27,lsr#32
 | |
| 	add	w17,w17,w28
 | |
| 	add	x19,x19,x28,lsr#32
 | |
| 	add	w20,w20,w30
 | |
| 	add	x21,x21,x30,lsr#32
 | |
| 
 | |
| 	b.lo	Ltail		// the subs at Loop_outer borrowed: fewer than 64 bytes remain, finish byte-wise
 | |
| // Full block: pack word pairs into 64-bit registers, XOR with 64 input bytes, store 64 output bytes.
 | |
| 	add	x5,x5,x6,lsl#32	// pack
 | |
| 	add	x7,x7,x8,lsl#32
 | |
| 	ldp	x6,x8,[x1,#0]		// load input (into registers whose keystream words were just packed away)
 | |
| 	add	x9,x9,x10,lsl#32
 | |
| 	add	x11,x11,x12,lsl#32
 | |
| 	ldp	x10,x12,[x1,#16]
 | |
| 	add	x13,x13,x14,lsl#32
 | |
| 	add	x15,x15,x16,lsl#32
 | |
| 	ldp	x14,x16,[x1,#32]
 | |
| 	add	x17,x17,x19,lsl#32
 | |
| 	add	x20,x20,x21,lsl#32
 | |
| 	ldp	x19,x21,[x1,#48]
 | |
| 	add	x1,x1,#64
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev	x5,x5		// big-endian builds: byte-reverse each packed keystream register before the XOR
 | |
| 	rev	x7,x7
 | |
| 	rev	x9,x9
 | |
| 	rev	x11,x11
 | |
| 	rev	x13,x13
 | |
| 	rev	x15,x15
 | |
| 	rev	x17,x17
 | |
| 	rev	x20,x20
 | |
| #endif
 | |
| 	eor	x5,x5,x6		// output = keystream XOR input
 | |
| 	eor	x7,x7,x8
 | |
| 	eor	x9,x9,x10
 | |
| 	eor	x11,x11,x12
 | |
| 	eor	x13,x13,x14
 | |
| 	eor	x15,x15,x16
 | |
| 	eor	x17,x17,x19
 | |
| 	eor	x20,x20,x21
 | |
| 
 | |
| 	stp	x5,x7,[x0,#0]		// store output
 | |
| 	add	x28,x28,#1			// increment counter (64-bit add on the packed pair holding the first two counter-block words)
 | |
| 	stp	x9,x11,[x0,#16]
 | |
| 	stp	x13,x15,[x0,#32]
 | |
| 	stp	x17,x20,[x0,#48]
 | |
| 	add	x0,x0,#64
 | |
| 
 | |
| 	b.hi	Loop_outer		// still using the subs flags: more than 64 bytes were left before this block, so go again
 | |
| // Done on an exact multiple of 64 bytes: the scratch area was never written on this path, so it is not wiped.
 | |
| 	ldp	x19,x20,[x29,#16]
 | |
| 	add	sp,sp,#64		// release the scratch area
 | |
| 	ldp	x21,x22,[x29,#32]
 | |
| 	ldp	x23,x24,[x29,#48]
 | |
| 	ldp	x25,x26,[x29,#64]
 | |
| 	ldp	x27,x28,[x29,#80]
 | |
| 	ldp	x29,x30,[sp],#96		// restore frame pointer and return address, pop the 96-byte frame
 | |
| 	AARCH64_VALIDATE_LINK_REGISTER
 | |
| Labort:		// also the target of the zero-length check at function entry
 | |
| 	ret
 | |
| 
 | |
| .align	4
 | |
| Ltail:		// scalar partial-block path: entered with x2 = remaining - 64 (negative)
 | |
| 	add	x2,x2,#64		// undo the over-subtraction: x2 = bytes remaining (less than 64)
 | |
| Less_than_64:		// also branched to from Ltail_neon; expects x2 = byte count and the keystream block still unpacked in w5-w17,w19-w21
 | |
| 	sub	x0,x0,#1		// bias out by -1: Loop_tail stores after x2 has already been incremented
 | |
| 	add	x1,x1,x2		// x1 = end of input
 | |
| 	add	x0,x0,x2		// x0 = end of output - 1
 | |
| 	add	x4,sp,x2		// x4 = end of the used part of the scratch keystream
 | |
| 	neg	x2,x2		// x2 = -count, a negative index that counts up to zero
 | |
| 
 | |
| 	add	x5,x5,x6,lsl#32	// pack
 | |
| 	add	x7,x7,x8,lsl#32
 | |
| 	add	x9,x9,x10,lsl#32
 | |
| 	add	x11,x11,x12,lsl#32
 | |
| 	add	x13,x13,x14,lsl#32
 | |
| 	add	x15,x15,x16,lsl#32
 | |
| 	add	x17,x17,x19,lsl#32
 | |
| 	add	x20,x20,x21,lsl#32
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev	x5,x5
 | |
| 	rev	x7,x7
 | |
| 	rev	x9,x9
 | |
| 	rev	x11,x11
 | |
| 	rev	x13,x13
 | |
| 	rev	x15,x15
 | |
| 	rev	x17,x17
 | |
| 	rev	x20,x20
 | |
| #endif
 | |
| 	stp	x5,x7,[sp,#0]		// spill the whole 64-byte keystream block to the scratch area so it can be read byte by byte
 | |
| 	stp	x9,x11,[sp,#16]
 | |
| 	stp	x13,x15,[sp,#32]
 | |
| 	stp	x17,x20,[sp,#48]
 | |
| 
 | |
| Loop_tail:		// one byte per pass: out[i] = in[i] XOR keystream[i]
 | |
| 	ldrb	w10,[x1,x2]		// input byte
 | |
| 	ldrb	w11,[x4,x2]		// keystream byte from the scratch area
 | |
| 	add	x2,x2,#1
 | |
| 	eor	w10,w10,w11
 | |
| 	strb	w10,[x0,x2]		// x0 carries the -1 bias that compensates for the increment above
 | |
| 	cbnz	x2,Loop_tail
 | |
| 
 | |
| 	stp	xzr,xzr,[sp,#0]		// overwrite the spilled keystream with zeros before the scratch area is released
 | |
| 	stp	xzr,xzr,[sp,#16]
 | |
| 	stp	xzr,xzr,[sp,#32]
 | |
| 	stp	xzr,xzr,[sp,#48]
 | |
| 
 | |
| 	ldp	x19,x20,[x29,#16]
 | |
| 	add	sp,sp,#64
 | |
| 	ldp	x21,x22,[x29,#32]
 | |
| 	ldp	x23,x24,[x29,#48]
 | |
| 	ldp	x25,x26,[x29,#64]
 | |
| 	ldp	x27,x28,[x29,#80]
 | |
| 	ldp	x29,x30,[sp],#96
 | |
| 	AARCH64_VALIDATE_LINK_REGISTER
 | |
| 	ret
 | |
| 
 | |
| 
 | |
| 
 | |
| .align	5
 | |
| ChaCha20_neon:		// NEON path, branched to from _ChaCha20_ctr32 with x0-x4 unchanged; each outer pass produces 256 bytes: one block in scalar registers plus three in vector registers
 | |
| 	AARCH64_SIGN_LINK_REGISTER
 | |
| 	stp	x29,x30,[sp,#-96]!		// same 96-byte frame layout as the scalar path
 | |
| 	add	x29,sp,#0
 | |
| 
 | |
| 	adrp	x5,Lsigma@PAGE
 | |
| 	add	x5,x5,Lsigma@PAGEOFF
 | |
| 	stp	x19,x20,[sp,#16]
 | |
| 	stp	x21,x22,[sp,#32]
 | |
| 	stp	x23,x24,[sp,#48]
 | |
| 	stp	x25,x26,[sp,#64]
 | |
| 	stp	x27,x28,[sp,#80]
 | |
| 	cmp	x2,#512
 | |
| 	b.hs	L512_or_more_neon		// 512 bytes or more: continue inside ChaCha20_512_neon, just after its own prologue
 | |
| 
 | |
| 	sub	sp,sp,#64		// 64-byte scratch area for a partial final block
 | |
| // Load the input state twice: packed in x22-x28,x30 for the scalar lane and as rows v24-v27 for the vector lanes.
 | |
| 	ldp	x22,x23,[x5]		// load sigma
 | |
| 	ld1	{v24.4s},[x5],#16		// v24 = sigma row; post-increment leaves x5 pointing at Lone
 | |
| 	ldp	x24,x25,[x3]		// load key
 | |
| 	ldp	x26,x27,[x3,#16]
 | |
| 	ld1	{v25.4s,v26.4s},[x3]		// v25,v26 = the two key rows
 | |
| 	ldp	x28,x30,[x4]		// load counter
 | |
| 	ld1	{v27.4s},[x4]		// v27 = counter row
 | |
| 	ld1	{v31.4s},[x5]		// v31 = {1,0,0,0} from Lone
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev64	v24.4s,v24.4s
 | |
| 	ror	x24,x24,#32
 | |
| 	ror	x25,x25,#32
 | |
| 	ror	x26,x26,#32
 | |
| 	ror	x27,x27,#32
 | |
| 	ror	x28,x28,#32
 | |
| 	ror	x30,x30,#32
 | |
| #endif
 | |
| 	add	v27.4s,v27.4s,v31.4s		// += 1: v27 = counter+1 (the scalar lane keeps counter+0 in x28)
 | |
| 	add	v28.4s,v27.4s,v31.4s		// v28 = counter+2
 | |
| 	add	v29.4s,v28.4s,v31.4s		// v29 = counter+3
 | |
| 	shl	v31.4s,v31.4s,#2			// 1 -> 4: per-pass counter step for the vector lanes
 | |
| 
 | |
| Loop_outer_neon:		// once per 256 bytes: rebuild the scalar working words and three vector working states (v0-v3, v4-v7, v16-v19)
 | |
| 	mov	w5,w22			// unpack key block
 | |
| 	lsr	x6,x22,#32
 | |
| 	mov	v0.16b,v24.16b		// sigma row for each of the three vector states
 | |
| 	mov	w7,w23
 | |
| 	lsr	x8,x23,#32
 | |
| 	mov	v4.16b,v24.16b
 | |
| 	mov	w9,w24
 | |
| 	lsr	x10,x24,#32
 | |
| 	mov	v16.16b,v24.16b
 | |
| 	mov	w11,w25
 | |
| 	mov	v1.16b,v25.16b		// first key row
 | |
| 	lsr	x12,x25,#32
 | |
| 	mov	v5.16b,v25.16b
 | |
| 	mov	w13,w26
 | |
| 	mov	v17.16b,v25.16b
 | |
| 	lsr	x14,x26,#32
 | |
| 	mov	v3.16b,v27.16b		// counter rows differ per state: v27, v28, v29
 | |
| 	mov	w15,w27
 | |
| 	mov	v7.16b,v28.16b
 | |
| 	lsr	x16,x27,#32
 | |
| 	mov	v19.16b,v29.16b
 | |
| 	mov	w17,w28
 | |
| 	mov	v2.16b,v26.16b		// second key row
 | |
| 	lsr	x19,x28,#32
 | |
| 	mov	v6.16b,v26.16b
 | |
| 	mov	w20,w30
 | |
| 	mov	v18.16b,v26.16b
 | |
| 	lsr	x21,x30,#32
 | |
| 
 | |
| 	mov	x4,#10		// 10 passes of Loop_neon, two rounds each
 | |
| 	subs	x2,x2,#256		// flags consumed after Loop_neon by b.lo Ltail_neon and b.hi Loop_outer_neon
 | |
| Loop_neon:		// one pass = column round + diagonal round; scalar and vector instructions for the four blocks are interleaved
 | |
| 	sub	x4,x4,#1
 | |
| 	add	v0.4s,v0.4s,v1.4s		// column round, vector lanes: a += b with rows a=v0/v4/v16, b=v1/v5/v17, c=v2/v6/v18, d=v3/v7/v19
 | |
| 	add	w5,w5,w9		// column round, scalar lane: same register roles as the scalar Loop
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	add	w6,w6,w10
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	add	w7,w7,w11
 | |
| 	eor	v3.16b,v3.16b,v0.16b		// d ^= a
 | |
| 	add	w8,w8,w12
 | |
| 	eor	v7.16b,v7.16b,v4.16b
 | |
| 	eor	w17,w17,w5
 | |
| 	eor	v19.16b,v19.16b,v16.16b
 | |
| 	eor	w19,w19,w6
 | |
| 	rev32	v3.8h,v3.8h		// swap the 16-bit halves of every 32-bit lane = rotate by 16
 | |
| 	eor	w20,w20,w7
 | |
| 	rev32	v7.8h,v7.8h
 | |
| 	eor	w21,w21,w8
 | |
| 	rev32	v19.8h,v19.8h
 | |
| 	ror	w17,w17,#16
 | |
| 	add	v2.4s,v2.4s,v3.4s		// c += d
 | |
| 	ror	w19,w19,#16
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	ror	w20,w20,#16
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w21,w21,#16
 | |
| 	eor	v20.16b,v1.16b,v2.16b		// b ^ c into temporaries v20-v22
 | |
| 	add	w13,w13,w17
 | |
| 	eor	v21.16b,v5.16b,v6.16b
 | |
| 	add	w14,w14,w19
 | |
| 	eor	v22.16b,v17.16b,v18.16b
 | |
| 	add	w15,w15,w20
 | |
| 	ushr	v1.4s,v20.4s,#20		// ushr #20 followed by sli #12 = rotate-left by 12
 | |
| 	add	w16,w16,w21
 | |
| 	ushr	v5.4s,v21.4s,#20
 | |
| 	eor	w9,w9,w13
 | |
| 	ushr	v17.4s,v22.4s,#20
 | |
| 	eor	w10,w10,w14
 | |
| 	sli	v1.4s,v20.4s,#12
 | |
| 	eor	w11,w11,w15
 | |
| 	sli	v5.4s,v21.4s,#12
 | |
| 	eor	w12,w12,w16
 | |
| 	sli	v17.4s,v22.4s,#12
 | |
| 	ror	w9,w9,#20
 | |
| 	add	v0.4s,v0.4s,v1.4s		// a += b
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	eor	v20.16b,v3.16b,v0.16b		// d ^ a into temporaries
 | |
| 	add	w5,w5,w9
 | |
| 	eor	v21.16b,v7.16b,v4.16b
 | |
| 	add	w6,w6,w10
 | |
| 	eor	v22.16b,v19.16b,v16.16b
 | |
| 	add	w7,w7,w11
 | |
| 	ushr	v3.4s,v20.4s,#24		// ushr #24 + sli #8 = rotate-left by 8
 | |
| 	add	w8,w8,w12
 | |
| 	ushr	v7.4s,v21.4s,#24
 | |
| 	eor	w17,w17,w5
 | |
| 	ushr	v19.4s,v22.4s,#24
 | |
| 	eor	w19,w19,w6
 | |
| 	sli	v3.4s,v20.4s,#8
 | |
| 	eor	w20,w20,w7
 | |
| 	sli	v7.4s,v21.4s,#8
 | |
| 	eor	w21,w21,w8
 | |
| 	sli	v19.4s,v22.4s,#8
 | |
| 	ror	w17,w17,#24
 | |
| 	add	v2.4s,v2.4s,v3.4s		// c += d
 | |
| 	ror	w19,w19,#24
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	ror	w20,w20,#24
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w21,w21,#24
 | |
| 	eor	v20.16b,v1.16b,v2.16b		// b ^ c into temporaries
 | |
| 	add	w13,w13,w17
 | |
| 	eor	v21.16b,v5.16b,v6.16b
 | |
| 	add	w14,w14,w19
 | |
| 	eor	v22.16b,v17.16b,v18.16b
 | |
| 	add	w15,w15,w20
 | |
| 	ushr	v1.4s,v20.4s,#25		// ushr #25 + sli #7 = rotate-left by 7
 | |
| 	add	w16,w16,w21
 | |
| 	ushr	v5.4s,v21.4s,#25
 | |
| 	eor	w9,w9,w13
 | |
| 	ushr	v17.4s,v22.4s,#25
 | |
| 	eor	w10,w10,w14
 | |
| 	sli	v1.4s,v20.4s,#7
 | |
| 	eor	w11,w11,w15
 | |
| 	sli	v5.4s,v21.4s,#7
 | |
| 	eor	w12,w12,w16
 | |
| 	sli	v17.4s,v22.4s,#7
 | |
| 	ror	w9,w9,#25
 | |
| 	ext	v2.16b,v2.16b,v2.16b,#8		// rotate rows within each vector state (c by 8 bytes, d by 12, b by 4) so the next round works on diagonals
 | |
| 	ror	w10,w10,#25
 | |
| 	ext	v6.16b,v6.16b,v6.16b,#8
 | |
| 	ror	w11,w11,#25
 | |
| 	ext	v18.16b,v18.16b,v18.16b,#8
 | |
| 	ror	w12,w12,#25
 | |
| 	ext	v3.16b,v3.16b,v3.16b,#12
 | |
| 	ext	v7.16b,v7.16b,v7.16b,#12
 | |
| 	ext	v19.16b,v19.16b,v19.16b,#12
 | |
| 	ext	v1.16b,v1.16b,v1.16b,#4
 | |
| 	ext	v5.16b,v5.16b,v5.16b,#4
 | |
| 	ext	v17.16b,v17.16b,v17.16b,#4
 | |
| 	add	v0.4s,v0.4s,v1.4s		// diagonal round, vector lanes: same steps on the rotated rows
 | |
| 	add	w5,w5,w10		// diagonal round, scalar lane: (w5,w10,w15,w21),(w6,w11,w16,w17),(w7,w12,w13,w19),(w8,w9,w14,w20)
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	add	w6,w6,w11
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v3.16b,v3.16b,v0.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v7.16b,v7.16b,v4.16b
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	v19.16b,v19.16b,v16.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	rev32	v3.8h,v3.8h
 | |
| 	eor	w19,w19,w7
 | |
| 	rev32	v7.8h,v7.8h
 | |
| 	eor	w20,w20,w8
 | |
| 	rev32	v19.8h,v19.8h
 | |
| 	ror	w21,w21,#16
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	ror	w17,w17,#16
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	ror	w19,w19,#16
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w20,w20,#16
 | |
| 	eor	v20.16b,v1.16b,v2.16b
 | |
| 	add	w15,w15,w21
 | |
| 	eor	v21.16b,v5.16b,v6.16b
 | |
| 	add	w16,w16,w17
 | |
| 	eor	v22.16b,v17.16b,v18.16b
 | |
| 	add	w13,w13,w19
 | |
| 	ushr	v1.4s,v20.4s,#20
 | |
| 	add	w14,w14,w20
 | |
| 	ushr	v5.4s,v21.4s,#20
 | |
| 	eor	w10,w10,w15
 | |
| 	ushr	v17.4s,v22.4s,#20
 | |
| 	eor	w11,w11,w16
 | |
| 	sli	v1.4s,v20.4s,#12
 | |
| 	eor	w12,w12,w13
 | |
| 	sli	v5.4s,v21.4s,#12
 | |
| 	eor	w9,w9,w14
 | |
| 	sli	v17.4s,v22.4s,#12
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v0.4s,v0.4s,v1.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	eor	v20.16b,v3.16b,v0.16b
 | |
| 	add	w5,w5,w10
 | |
| 	eor	v21.16b,v7.16b,v4.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v22.16b,v19.16b,v16.16b
 | |
| 	add	w7,w7,w12
 | |
| 	ushr	v3.4s,v20.4s,#24
 | |
| 	add	w8,w8,w9
 | |
| 	ushr	v7.4s,v21.4s,#24
 | |
| 	eor	w21,w21,w5
 | |
| 	ushr	v19.4s,v22.4s,#24
 | |
| 	eor	w17,w17,w6
 | |
| 	sli	v3.4s,v20.4s,#8
 | |
| 	eor	w19,w19,w7
 | |
| 	sli	v7.4s,v21.4s,#8
 | |
| 	eor	w20,w20,w8
 | |
| 	sli	v19.4s,v22.4s,#8
 | |
| 	ror	w21,w21,#24
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	ror	w17,w17,#24
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	ror	w19,w19,#24
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w20,w20,#24
 | |
| 	eor	v20.16b,v1.16b,v2.16b
 | |
| 	add	w15,w15,w21
 | |
| 	eor	v21.16b,v5.16b,v6.16b
 | |
| 	add	w16,w16,w17
 | |
| 	eor	v22.16b,v17.16b,v18.16b
 | |
| 	add	w13,w13,w19
 | |
| 	ushr	v1.4s,v20.4s,#25
 | |
| 	add	w14,w14,w20
 | |
| 	ushr	v5.4s,v21.4s,#25
 | |
| 	eor	w10,w10,w15
 | |
| 	ushr	v17.4s,v22.4s,#25
 | |
| 	eor	w11,w11,w16
 | |
| 	sli	v1.4s,v20.4s,#7
 | |
| 	eor	w12,w12,w13
 | |
| 	sli	v5.4s,v21.4s,#7
 | |
| 	eor	w9,w9,w14
 | |
| 	sli	v17.4s,v22.4s,#7
 | |
| 	ror	w10,w10,#25
 | |
| 	ext	v2.16b,v2.16b,v2.16b,#8		// rotate the rows back (c by 8 bytes, d by 4, b by 12) to restore column alignment
 | |
| 	ror	w11,w11,#25
 | |
| 	ext	v6.16b,v6.16b,v6.16b,#8
 | |
| 	ror	w12,w12,#25
 | |
| 	ext	v18.16b,v18.16b,v18.16b,#8
 | |
| 	ror	w9,w9,#25
 | |
| 	ext	v3.16b,v3.16b,v3.16b,#4
 | |
| 	ext	v7.16b,v7.16b,v7.16b,#4
 | |
| 	ext	v19.16b,v19.16b,v19.16b,#4
 | |
| 	ext	v1.16b,v1.16b,v1.16b,#12
 | |
| 	ext	v5.16b,v5.16b,v5.16b,#12
 | |
| 	ext	v17.16b,v17.16b,v17.16b,#12
 | |
| 	cbnz	x4,Loop_neon		// flags from "subs x2,x2,#256" are still intact after the loop
 | |
| // Add the original input state back into all four blocks (scalar lane from x22-x28,x30; vector lanes from v24-v29).
 | |
| 	add	w5,w5,w22		// accumulate key block
 | |
| 	add	v0.4s,v0.4s,v24.4s
 | |
| 	add	x6,x6,x22,lsr#32
 | |
| 	add	v4.4s,v4.4s,v24.4s
 | |
| 	add	w7,w7,w23
 | |
| 	add	v16.4s,v16.4s,v24.4s
 | |
| 	add	x8,x8,x23,lsr#32
 | |
| 	add	v2.4s,v2.4s,v26.4s
 | |
| 	add	w9,w9,w24
 | |
| 	add	v6.4s,v6.4s,v26.4s
 | |
| 	add	x10,x10,x24,lsr#32
 | |
| 	add	v18.4s,v18.4s,v26.4s
 | |
| 	add	w11,w11,w25
 | |
| 	add	v3.4s,v3.4s,v27.4s		// each vector state adds back its own counter row (v27, v28, v29)
 | |
| 	add	x12,x12,x25,lsr#32
 | |
| 	add	w13,w13,w26
 | |
| 	add	v7.4s,v7.4s,v28.4s
 | |
| 	add	x14,x14,x26,lsr#32
 | |
| 	add	w15,w15,w27
 | |
| 	add	v19.4s,v19.4s,v29.4s
 | |
| 	add	x16,x16,x27,lsr#32
 | |
| 	add	w17,w17,w28
 | |
| 	add	v1.4s,v1.4s,v25.4s
 | |
| 	add	x19,x19,x28,lsr#32
 | |
| 	add	w20,w20,w30
 | |
| 	add	v5.4s,v5.4s,v25.4s
 | |
| 	add	x21,x21,x30,lsr#32
 | |
| 	add	v17.4s,v17.4s,v25.4s
 | |
| 
 | |
| 	b.lo	Ltail_neon		// the subs at Loop_outer_neon borrowed: fewer than 256 bytes remain
 | |
| // Full 256 bytes: the scalar block is written first, then the three vector blocks in order v0-v3, v4-v7, v16-v19.
 | |
| 	add	x5,x5,x6,lsl#32	// pack
 | |
| 	add	x7,x7,x8,lsl#32
 | |
| 	ldp	x6,x8,[x1,#0]		// load input
 | |
| 	add	x9,x9,x10,lsl#32
 | |
| 	add	x11,x11,x12,lsl#32
 | |
| 	ldp	x10,x12,[x1,#16]
 | |
| 	add	x13,x13,x14,lsl#32
 | |
| 	add	x15,x15,x16,lsl#32
 | |
| 	ldp	x14,x16,[x1,#32]
 | |
| 	add	x17,x17,x19,lsl#32
 | |
| 	add	x20,x20,x21,lsl#32
 | |
| 	ldp	x19,x21,[x1,#48]
 | |
| 	add	x1,x1,#64
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev	x5,x5
 | |
| 	rev	x7,x7
 | |
| 	rev	x9,x9
 | |
| 	rev	x11,x11
 | |
| 	rev	x13,x13
 | |
| 	rev	x15,x15
 | |
| 	rev	x17,x17
 | |
| 	rev	x20,x20
 | |
| #endif
 | |
| 	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64		// input for the second block, into temporaries
 | |
| 	eor	x5,x5,x6
 | |
| 	eor	x7,x7,x8
 | |
| 	eor	x9,x9,x10
 | |
| 	eor	x11,x11,x12
 | |
| 	eor	x13,x13,x14
 | |
| 	eor	v0.16b,v0.16b,v20.16b
 | |
| 	eor	x15,x15,x16
 | |
| 	eor	v1.16b,v1.16b,v21.16b
 | |
| 	eor	x17,x17,x19
 | |
| 	eor	v2.16b,v2.16b,v22.16b
 | |
| 	eor	x20,x20,x21
 | |
| 	eor	v3.16b,v3.16b,v23.16b
 | |
| 	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64		// input for the third block
 | |
| 
 | |
| 	stp	x5,x7,[x0,#0]		// store output
 | |
| 	add	x28,x28,#4			// increment counter (four blocks consumed per pass)
 | |
| 	stp	x9,x11,[x0,#16]
 | |
| 	add	v27.4s,v27.4s,v31.4s		// += 4
 | |
| 	stp	x13,x15,[x0,#32]
 | |
| 	add	v28.4s,v28.4s,v31.4s
 | |
| 	stp	x17,x20,[x0,#48]
 | |
| 	add	v29.4s,v29.4s,v31.4s
 | |
| 	add	x0,x0,#64
 | |
| 
 | |
| 	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
 | |
| 	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64		// v0-v3 are free again: reuse them for the fourth block's input
 | |
| 
 | |
| 	eor	v4.16b,v4.16b,v20.16b
 | |
| 	eor	v5.16b,v5.16b,v21.16b
 | |
| 	eor	v6.16b,v6.16b,v22.16b
 | |
| 	eor	v7.16b,v7.16b,v23.16b
 | |
| 	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
 | |
| 
 | |
| 	eor	v16.16b,v16.16b,v0.16b
 | |
| 	eor	v17.16b,v17.16b,v1.16b
 | |
| 	eor	v18.16b,v18.16b,v2.16b
 | |
| 	eor	v19.16b,v19.16b,v3.16b
 | |
| 	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
 | |
| 
 | |
| 	b.hi	Loop_outer_neon		// more than 256 bytes were left before this pass: go again
 | |
| // Done on an exact multiple of 256 bytes; the scratch area was not written on this path.
 | |
| 	ldp	x19,x20,[x29,#16]
 | |
| 	add	sp,sp,#64
 | |
| 	ldp	x21,x22,[x29,#32]
 | |
| 	ldp	x23,x24,[x29,#48]
 | |
| 	ldp	x25,x26,[x29,#64]
 | |
| 	ldp	x27,x28,[x29,#80]
 | |
| 	ldp	x29,x30,[sp],#96
 | |
| 	AARCH64_VALIDATE_LINK_REGISTER
 | |
| 	ret
 | |
| 
 | |
| Ltail_neon:		// fewer than 256 bytes left; four keystream blocks are ready (scalar words, v0-v3, v4-v7, v16-v19) and are consumed in that order
 | |
| 	add	x2,x2,#256		// undo the over-subtraction: x2 = bytes remaining
 | |
| 	cmp	x2,#64
 | |
| 	b.lo	Less_than_64		// under one block: reuse the scalar byte-wise tail in _ChaCha20_ctr32 (scalar words are still unpacked)
 | |
| 
 | |
| 	add	x5,x5,x6,lsl#32	// pack
 | |
| 	add	x7,x7,x8,lsl#32
 | |
| 	ldp	x6,x8,[x1,#0]		// load input
 | |
| 	add	x9,x9,x10,lsl#32
 | |
| 	add	x11,x11,x12,lsl#32
 | |
| 	ldp	x10,x12,[x1,#16]
 | |
| 	add	x13,x13,x14,lsl#32
 | |
| 	add	x15,x15,x16,lsl#32
 | |
| 	ldp	x14,x16,[x1,#32]
 | |
| 	add	x17,x17,x19,lsl#32
 | |
| 	add	x20,x20,x21,lsl#32
 | |
| 	ldp	x19,x21,[x1,#48]
 | |
| 	add	x1,x1,#64
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev	x5,x5
 | |
| 	rev	x7,x7
 | |
| 	rev	x9,x9
 | |
| 	rev	x11,x11
 | |
| 	rev	x13,x13
 | |
| 	rev	x15,x15
 | |
| 	rev	x17,x17
 | |
| 	rev	x20,x20
 | |
| #endif
 | |
| 	eor	x5,x5,x6
 | |
| 	eor	x7,x7,x8
 | |
| 	eor	x9,x9,x10
 | |
| 	eor	x11,x11,x12
 | |
| 	eor	x13,x13,x14
 | |
| 	eor	x15,x15,x16
 | |
| 	eor	x17,x17,x19
 | |
| 	eor	x20,x20,x21
 | |
| 
 | |
| 	stp	x5,x7,[x0,#0]		// store output
 | |
| 	add	x28,x28,#4			// increment counter (x28 is not read again before the epilogue restores it)
 | |
| 	stp	x9,x11,[x0,#16]
 | |
| 	stp	x13,x15,[x0,#32]
 | |
| 	stp	x17,x20,[x0,#48]
 | |
| 	add	x0,x0,#64
 | |
| 	b.eq	Ldone_neon		// flags still from "cmp x2,#64" above: exactly 64 bytes were left
 | |
| 	sub	x2,x2,#64
 | |
| 	cmp	x2,#64
 | |
| 	b.lo	Less_than_128		// partial second block: finish byte-wise from v0-v3
 | |
| 
 | |
| 	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
 | |
| 	eor	v0.16b,v0.16b,v20.16b
 | |
| 	eor	v1.16b,v1.16b,v21.16b
 | |
| 	eor	v2.16b,v2.16b,v22.16b
 | |
| 	eor	v3.16b,v3.16b,v23.16b
 | |
| 	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
 | |
| 	b.eq	Ldone_neon		// flags from the preceding cmp; the vector instructions in between do not write them
 | |
| 	sub	x2,x2,#64
 | |
| 	cmp	x2,#64
 | |
| 	b.lo	Less_than_192		// partial third block: finish byte-wise from v4-v7
 | |
| 
 | |
| 	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
 | |
| 	eor	v4.16b,v4.16b,v20.16b
 | |
| 	eor	v5.16b,v5.16b,v21.16b
 | |
| 	eor	v6.16b,v6.16b,v22.16b
 | |
| 	eor	v7.16b,v7.16b,v23.16b
 | |
| 	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
 | |
| 	b.eq	Ldone_neon
 | |
| 	sub	x2,x2,#64		// x2 = bytes left in the partial fourth block
 | |
| 
 | |
| 	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]		// spill the fourth keystream block to the scratch area for the byte loop
 | |
| 	b	Last_neon
 | |
| 
 | |
| Less_than_128:
 | |
| 	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]		// spill the second keystream block
 | |
| 	b	Last_neon
 | |
| Less_than_192:
 | |
| 	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]		// spill the third keystream block
 | |
| 	b	Last_neon
 | |
| 
 | |
| .align	4
 | |
| Last_neon:		// byte-wise finish for the NEON path: x2 = bytes left (under 64), keystream block already stored at [sp]
 | |
| 	sub	x0,x0,#1		// bias out by -1: the store in the loop happens after x2 is incremented
 | |
| 	add	x1,x1,x2		// x1 = end of input
 | |
| 	add	x0,x0,x2		// x0 = end of output - 1
 | |
| 	add	x4,sp,x2		// x4 = end of the used part of the scratch keystream
 | |
| 	neg	x2,x2		// x2 = -count, counts up to zero
 | |
| 
 | |
| Loop_tail_neon:		// one byte per pass: out[i] = in[i] XOR keystream[i]
 | |
| 	ldrb	w10,[x1,x2]
 | |
| 	ldrb	w11,[x4,x2]
 | |
| 	add	x2,x2,#1
 | |
| 	eor	w10,w10,w11
 | |
| 	strb	w10,[x0,x2]
 | |
| 	cbnz	x2,Loop_tail_neon
 | |
| 
 | |
| 	stp	xzr,xzr,[sp,#0]		// overwrite the spilled keystream with zeros before releasing the scratch area
 | |
| 	stp	xzr,xzr,[sp,#16]
 | |
| 	stp	xzr,xzr,[sp,#32]
 | |
| 	stp	xzr,xzr,[sp,#48]
 | |
| 
 | |
| Ldone_neon:		// shared epilogue; the b.eq branches from Ltail_neon arrive here before anything is spilled to [sp], so they skip the wipe
 | |
| 	ldp	x19,x20,[x29,#16]
 | |
| 	add	sp,sp,#64
 | |
| 	ldp	x21,x22,[x29,#32]
 | |
| 	ldp	x23,x24,[x29,#48]
 | |
| 	ldp	x25,x26,[x29,#64]
 | |
| 	ldp	x27,x28,[x29,#80]
 | |
| 	ldp	x29,x30,[sp],#96
 | |
| 	AARCH64_VALIDATE_LINK_REGISTER
 | |
| 	ret
 | |
| 
 | |
| 
 | |
| .align	5
 | |
| ChaCha20_512_neon:
 | |
| 	AARCH64_SIGN_LINK_REGISTER
 | |
| 	stp	x29,x30,[sp,#-96]!
 | |
| 	add	x29,sp,#0
 | |
| 
 | |
| 	adrp	x5,Lsigma@PAGE
 | |
| 	add	x5,x5,Lsigma@PAGEOFF
 | |
| 	stp	x19,x20,[sp,#16]
 | |
| 	stp	x21,x22,[sp,#32]
 | |
| 	stp	x23,x24,[sp,#48]
 | |
| 	stp	x25,x26,[sp,#64]
 | |
| 	stp	x27,x28,[sp,#80]
 | |
| 
 | |
| L512_or_more_neon:
 | |
| 	sub	sp,sp,#128+64
 | |
| 
 | |
| 	ldp	x22,x23,[x5]		// load sigma
 | |
| 	ld1	{v24.4s},[x5],#16
 | |
| 	ldp	x24,x25,[x3]		// load key
 | |
| 	ldp	x26,x27,[x3,#16]
 | |
| 	ld1	{v25.4s,v26.4s},[x3]
 | |
| 	ldp	x28,x30,[x4]		// load counter
 | |
| 	ld1	{v27.4s},[x4]
 | |
| 	ld1	{v31.4s},[x5]
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev64	v24.4s,v24.4s
 | |
| 	ror	x24,x24,#32
 | |
| 	ror	x25,x25,#32
 | |
| 	ror	x26,x26,#32
 | |
| 	ror	x27,x27,#32
 | |
| 	ror	x28,x28,#32
 | |
| 	ror	x30,x30,#32
 | |
| #endif
 | |
| 	add	v27.4s,v27.4s,v31.4s		// += 1
 | |
| 	stp	q24,q25,[sp,#0]		// off-load key block, invariant part
 | |
| 	add	v27.4s,v27.4s,v31.4s		// not typo
 | |
| 	str	q26,[sp,#32]
 | |
| 	add	v28.4s,v27.4s,v31.4s
 | |
| 	add	v29.4s,v28.4s,v31.4s
 | |
| 	add	v30.4s,v29.4s,v31.4s
 | |
| 	shl	v31.4s,v31.4s,#2			// 1 -> 4
 | |
| 
 | |
| 	stp	d8,d9,[sp,#128+0]		// meet ABI requirements
 | |
| 	stp	d10,d11,[sp,#128+16]
 | |
| 	stp	d12,d13,[sp,#128+32]
 | |
| 	stp	d14,d15,[sp,#128+48]
 | |
| 
 | |
| 	sub	x2,x2,#512			// not typo
 | |
| 
 | |
| Loop_outer_512_neon:
 | |
| 	mov	v0.16b,v24.16b
 | |
| 	mov	v4.16b,v24.16b
 | |
| 	mov	v8.16b,v24.16b
 | |
| 	mov	v12.16b,v24.16b
 | |
| 	mov	v16.16b,v24.16b
 | |
| 	mov	v20.16b,v24.16b
 | |
| 	mov	v1.16b,v25.16b
 | |
| 	mov	w5,w22			// unpack key block
 | |
| 	mov	v5.16b,v25.16b
 | |
| 	lsr	x6,x22,#32
 | |
| 	mov	v9.16b,v25.16b
 | |
| 	mov	w7,w23
 | |
| 	mov	v13.16b,v25.16b
 | |
| 	lsr	x8,x23,#32
 | |
| 	mov	v17.16b,v25.16b
 | |
| 	mov	w9,w24
 | |
| 	mov	v21.16b,v25.16b
 | |
| 	lsr	x10,x24,#32
 | |
| 	mov	v3.16b,v27.16b
 | |
| 	mov	w11,w25
 | |
| 	mov	v7.16b,v28.16b
 | |
| 	lsr	x12,x25,#32
 | |
| 	mov	v11.16b,v29.16b
 | |
| 	mov	w13,w26
 | |
| 	mov	v15.16b,v30.16b
 | |
| 	lsr	x14,x26,#32
 | |
| 	mov	v2.16b,v26.16b
 | |
| 	mov	w15,w27
 | |
| 	mov	v6.16b,v26.16b
 | |
| 	lsr	x16,x27,#32
 | |
| 	add	v19.4s,v3.4s,v31.4s			// +4
 | |
| 	mov	w17,w28
 | |
| 	add	v23.4s,v7.4s,v31.4s			// +4
 | |
| 	lsr	x19,x28,#32
 | |
| 	mov	v10.16b,v26.16b
 | |
| 	mov	w20,w30
 | |
| 	mov	v14.16b,v26.16b
 | |
| 	lsr	x21,x30,#32
 | |
| 	mov	v18.16b,v26.16b
 | |
| 	stp	q27,q28,[sp,#48]		// off-load key block, variable part
 | |
| 	mov	v22.16b,v26.16b
 | |
| 	str	q29,[sp,#80]
 | |
| 
 | |
| 	mov	x4,#5
 | |
| 	subs	x2,x2,#512
 | |
| Loop_upper_neon:
 | |
| 	sub	x4,x4,#1
 | |
| 	add	v0.4s,v0.4s,v1.4s
 | |
| 	add	w5,w5,w9
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	add	w6,w6,w10
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	add	w7,w7,w11
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	add	w8,w8,w12
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	eor	w17,w17,w5
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	eor	w19,w19,w6
 | |
| 	eor	v3.16b,v3.16b,v0.16b
 | |
| 	eor	w20,w20,w7
 | |
| 	eor	v7.16b,v7.16b,v4.16b
 | |
| 	eor	w21,w21,w8
 | |
| 	eor	v11.16b,v11.16b,v8.16b
 | |
| 	ror	w17,w17,#16
 | |
| 	eor	v15.16b,v15.16b,v12.16b
 | |
| 	ror	w19,w19,#16
 | |
| 	eor	v19.16b,v19.16b,v16.16b
 | |
| 	ror	w20,w20,#16
 | |
| 	eor	v23.16b,v23.16b,v20.16b
 | |
| 	ror	w21,w21,#16
 | |
| 	rev32	v3.8h,v3.8h
 | |
| 	add	w13,w13,w17
 | |
| 	rev32	v7.8h,v7.8h
 | |
| 	add	w14,w14,w19
 | |
| 	rev32	v11.8h,v11.8h
 | |
| 	add	w15,w15,w20
 | |
| 	rev32	v15.8h,v15.8h
 | |
| 	add	w16,w16,w21
 | |
| 	rev32	v19.8h,v19.8h
 | |
| 	eor	w9,w9,w13
 | |
| 	rev32	v23.8h,v23.8h
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b
 | |
| 	add	w5,w5,w9
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w10
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w11
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w12
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w17,w17,w5
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w19,w19,w6
 | |
| 	ushr	v1.4s,v24.4s,#20
 | |
| 	eor	w20,w20,w7
 | |
| 	ushr	v5.4s,v25.4s,#20
 | |
| 	eor	w21,w21,w8
 | |
| 	ushr	v9.4s,v26.4s,#20
 | |
| 	ror	w17,w17,#24
 | |
| 	ushr	v13.4s,v27.4s,#20
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v17.4s,v28.4s,#20
 | |
| 	ror	w20,w20,#24
 | |
| 	ushr	v21.4s,v29.4s,#20
 | |
| 	ror	w21,w21,#24
 | |
| 	sli	v1.4s,v24.4s,#12
 | |
| 	add	w13,w13,w17
 | |
| 	sli	v5.4s,v25.4s,#12
 | |
| 	add	w14,w14,w19
 | |
| 	sli	v9.4s,v26.4s,#12
 | |
| 	add	w15,w15,w20
 | |
| 	sli	v13.4s,v27.4s,#12
 | |
| 	add	w16,w16,w21
 | |
| 	sli	v17.4s,v28.4s,#12
 | |
| 	eor	w9,w9,w13
 | |
| 	sli	v21.4s,v29.4s,#12
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v0.4s,v0.4s,v1.4s
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	ror	w9,w9,#25
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	ror	w10,w10,#25
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	ror	w11,w11,#25
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	ror	w12,w12,#25
 | |
| 	eor	v24.16b,v3.16b,v0.16b
 | |
| 	add	w5,w5,w10
 | |
| 	eor	v25.16b,v7.16b,v4.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v11.16b,v8.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v15.16b,v12.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v19.16b,v16.16b
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	v29.16b,v23.16b,v20.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v3.4s,v24.4s,#24
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v7.4s,v25.4s,#24
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v11.4s,v26.4s,#24
 | |
| 	ror	w21,w21,#16
 | |
| 	ushr	v15.4s,v27.4s,#24
 | |
| 	ror	w17,w17,#16
 | |
| 	ushr	v19.4s,v28.4s,#24
 | |
| 	ror	w19,w19,#16
 | |
| 	ushr	v23.4s,v29.4s,#24
 | |
| 	ror	w20,w20,#16
 | |
| 	sli	v3.4s,v24.4s,#8
 | |
| 	add	w15,w15,w21
 | |
| 	sli	v7.4s,v25.4s,#8
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v11.4s,v26.4s,#8
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v15.4s,v27.4s,#8
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v19.4s,v28.4s,#8
 | |
| 	eor	w10,w10,w15
 | |
| 	sli	v23.4s,v29.4s,#8
 | |
| 	eor	w11,w11,w16
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	eor	w12,w12,w13
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w9,w9,w14
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b
 | |
| 	add	w5,w5,w10
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v1.4s,v24.4s,#25
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v5.4s,v25.4s,#25
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v9.4s,v26.4s,#25
 | |
| 	ror	w21,w21,#24
 | |
| 	ushr	v13.4s,v27.4s,#25
 | |
| 	ror	w17,w17,#24
 | |
| 	ushr	v17.4s,v28.4s,#25
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v21.4s,v29.4s,#25
 | |
| 	ror	w20,w20,#24
 | |
| 	sli	v1.4s,v24.4s,#7
 | |
| 	add	w15,w15,w21
 | |
| 	sli	v5.4s,v25.4s,#7
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v9.4s,v26.4s,#7
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v13.4s,v27.4s,#7
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v17.4s,v28.4s,#7
 | |
| 	eor	w10,w10,w15
 | |
| 	sli	v21.4s,v29.4s,#7
 | |
| 	eor	w11,w11,w16
 | |
| 	ext	v2.16b,v2.16b,v2.16b,#8
 | |
| 	eor	w12,w12,w13
 | |
| 	ext	v6.16b,v6.16b,v6.16b,#8
 | |
| 	eor	w9,w9,w14
 | |
| 	ext	v10.16b,v10.16b,v10.16b,#8
 | |
| 	ror	w10,w10,#25
 | |
| 	ext	v14.16b,v14.16b,v14.16b,#8
 | |
| 	ror	w11,w11,#25
 | |
| 	ext	v18.16b,v18.16b,v18.16b,#8
 | |
| 	ror	w12,w12,#25
 | |
| 	ext	v22.16b,v22.16b,v22.16b,#8
 | |
| 	ror	w9,w9,#25
 | |
| 	ext	v3.16b,v3.16b,v3.16b,#12
 | |
| 	ext	v7.16b,v7.16b,v7.16b,#12
 | |
| 	ext	v11.16b,v11.16b,v11.16b,#12
 | |
| 	ext	v15.16b,v15.16b,v15.16b,#12
 | |
| 	ext	v19.16b,v19.16b,v19.16b,#12
 | |
| 	ext	v23.16b,v23.16b,v23.16b,#12
 | |
| 	ext	v1.16b,v1.16b,v1.16b,#4
 | |
| 	ext	v5.16b,v5.16b,v5.16b,#4
 | |
| 	ext	v9.16b,v9.16b,v9.16b,#4
 | |
| 	ext	v13.16b,v13.16b,v13.16b,#4
 | |
| 	ext	v17.16b,v17.16b,v17.16b,#4
 | |
| 	ext	v21.16b,v21.16b,v21.16b,#4
 | |
| 	add	v0.4s,v0.4s,v1.4s
 | |
| 	add	w5,w5,w9
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	add	w6,w6,w10
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	add	w7,w7,w11
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	add	w8,w8,w12
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	eor	w17,w17,w5
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	eor	w19,w19,w6
 | |
| 	eor	v3.16b,v3.16b,v0.16b
 | |
| 	eor	w20,w20,w7
 | |
| 	eor	v7.16b,v7.16b,v4.16b
 | |
| 	eor	w21,w21,w8
 | |
| 	eor	v11.16b,v11.16b,v8.16b
 | |
| 	ror	w17,w17,#16
 | |
| 	eor	v15.16b,v15.16b,v12.16b
 | |
| 	ror	w19,w19,#16
 | |
| 	eor	v19.16b,v19.16b,v16.16b
 | |
| 	ror	w20,w20,#16
 | |
| 	eor	v23.16b,v23.16b,v20.16b
 | |
| 	ror	w21,w21,#16
 | |
| 	rev32	v3.8h,v3.8h
 | |
| 	add	w13,w13,w17
 | |
| 	rev32	v7.8h,v7.8h
 | |
| 	add	w14,w14,w19
 | |
| 	rev32	v11.8h,v11.8h
 | |
| 	add	w15,w15,w20
 | |
| 	rev32	v15.8h,v15.8h
 | |
| 	add	w16,w16,w21
 | |
| 	rev32	v19.8h,v19.8h
 | |
| 	eor	w9,w9,w13
 | |
| 	rev32	v23.8h,v23.8h
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b
 | |
| 	add	w5,w5,w9
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w10
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w11
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w12
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w17,w17,w5
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w19,w19,w6
 | |
| 	ushr	v1.4s,v24.4s,#20
 | |
| 	eor	w20,w20,w7
 | |
| 	ushr	v5.4s,v25.4s,#20
 | |
| 	eor	w21,w21,w8
 | |
| 	ushr	v9.4s,v26.4s,#20
 | |
| 	ror	w17,w17,#24
 | |
| 	ushr	v13.4s,v27.4s,#20
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v17.4s,v28.4s,#20
 | |
| 	ror	w20,w20,#24
 | |
| 	ushr	v21.4s,v29.4s,#20
 | |
| 	ror	w21,w21,#24
 | |
| 	sli	v1.4s,v24.4s,#12
 | |
| 	add	w13,w13,w17
 | |
| 	sli	v5.4s,v25.4s,#12
 | |
| 	add	w14,w14,w19
 | |
| 	sli	v9.4s,v26.4s,#12
 | |
| 	add	w15,w15,w20
 | |
| 	sli	v13.4s,v27.4s,#12
 | |
| 	add	w16,w16,w21
 | |
| 	sli	v17.4s,v28.4s,#12
 | |
| 	eor	w9,w9,w13
 | |
| 	sli	v21.4s,v29.4s,#12
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v0.4s,v0.4s,v1.4s
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	ror	w9,w9,#25
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	ror	w10,w10,#25
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	ror	w11,w11,#25
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	ror	w12,w12,#25
 | |
| 	eor	v24.16b,v3.16b,v0.16b
 | |
| 	add	w5,w5,w10
 | |
| 	eor	v25.16b,v7.16b,v4.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v11.16b,v8.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v15.16b,v12.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v19.16b,v16.16b
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	v29.16b,v23.16b,v20.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v3.4s,v24.4s,#24
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v7.4s,v25.4s,#24
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v11.4s,v26.4s,#24
 | |
| 	ror	w21,w21,#16
 | |
| 	ushr	v15.4s,v27.4s,#24
 | |
| 	ror	w17,w17,#16
 | |
| 	ushr	v19.4s,v28.4s,#24
 | |
| 	ror	w19,w19,#16
 | |
| 	ushr	v23.4s,v29.4s,#24
 | |
| 	ror	w20,w20,#16
 | |
| 	sli	v3.4s,v24.4s,#8
 | |
| 	add	w15,w15,w21
 | |
| 	sli	v7.4s,v25.4s,#8
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v11.4s,v26.4s,#8
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v15.4s,v27.4s,#8
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v19.4s,v28.4s,#8
 | |
| 	eor	w10,w10,w15
 | |
| 	sli	v23.4s,v29.4s,#8
 | |
| 	eor	w11,w11,w16
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	eor	w12,w12,w13
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w9,w9,w14
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b
 | |
| 	add	w5,w5,w10
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v1.4s,v24.4s,#25
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v5.4s,v25.4s,#25
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v9.4s,v26.4s,#25
 | |
| 	ror	w21,w21,#24
 | |
| 	ushr	v13.4s,v27.4s,#25
 | |
| 	ror	w17,w17,#24
 | |
| 	ushr	v17.4s,v28.4s,#25
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v21.4s,v29.4s,#25
 | |
| 	ror	w20,w20,#24
 | |
| 	sli	v1.4s,v24.4s,#7
 | |
| 	add	w15,w15,w21
 | |
| 	sli	v5.4s,v25.4s,#7
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v9.4s,v26.4s,#7
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v13.4s,v27.4s,#7
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v17.4s,v28.4s,#7
 | |
| 	eor	w10,w10,w15
 | |
| 	sli	v21.4s,v29.4s,#7
 | |
| 	eor	w11,w11,w16
 | |
| 	ext	v2.16b,v2.16b,v2.16b,#8
 | |
| 	eor	w12,w12,w13
 | |
| 	ext	v6.16b,v6.16b,v6.16b,#8
 | |
| 	eor	w9,w9,w14
 | |
| 	ext	v10.16b,v10.16b,v10.16b,#8
 | |
| 	ror	w10,w10,#25
 | |
| 	ext	v14.16b,v14.16b,v14.16b,#8
 | |
| 	ror	w11,w11,#25
 | |
| 	ext	v18.16b,v18.16b,v18.16b,#8
 | |
| 	ror	w12,w12,#25
 | |
| 	ext	v22.16b,v22.16b,v22.16b,#8
 | |
| 	ror	w9,w9,#25
 | |
| 	ext	v3.16b,v3.16b,v3.16b,#4
 | |
| 	ext	v7.16b,v7.16b,v7.16b,#4
 | |
| 	ext	v11.16b,v11.16b,v11.16b,#4
 | |
| 	ext	v15.16b,v15.16b,v15.16b,#4
 | |
| 	ext	v19.16b,v19.16b,v19.16b,#4
 | |
| 	ext	v23.16b,v23.16b,v23.16b,#4
 | |
| 	ext	v1.16b,v1.16b,v1.16b,#12
 | |
| 	ext	v5.16b,v5.16b,v5.16b,#12
 | |
| 	ext	v9.16b,v9.16b,v9.16b,#12
 | |
| 	ext	v13.16b,v13.16b,v13.16b,#12
 | |
| 	ext	v17.16b,v17.16b,v17.16b,#12
 | |
| 	ext	v21.16b,v21.16b,v21.16b,#12
 | |
| 	cbnz	x4,Loop_upper_neon
 | |
| 
 | |
| 	add	w5,w5,w22		// accumulate key block: scalar lane adds the packed input state (x22-x28,x30) back in
 | |
| 	add	x6,x6,x22,lsr#32	// high-word registers take the upper half of each packed register
 | |
| 	add	w7,w7,w23
 | |
| 	add	x8,x8,x23,lsr#32
 | |
| 	add	w9,w9,w24
 | |
| 	add	x10,x10,x24,lsr#32
 | |
| 	add	w11,w11,w25
 | |
| 	add	x12,x12,x25,lsr#32
 | |
| 	add	w13,w13,w26
 | |
| 	add	x14,x14,x26,lsr#32
 | |
| 	add	w15,w15,w27
 | |
| 	add	x16,x16,x27,lsr#32
 | |
| 	add	w17,w17,w28
 | |
| 	add	x19,x19,x28,lsr#32
 | |
| 	add	w20,w20,w30
 | |
| 	add	x21,x21,x30,lsr#32
 | |
| 
 | |
| 	add	x5,x5,x6,lsl#32	// pack: x5 = w5 | (x6 << 32); the w-register add above left x5's upper half zero
 | |
| 	add	x7,x7,x8,lsl#32
 | |
| 	ldp	x6,x8,[x1,#0]		// load input: 64 bytes at [x1], into the high-word registers freed by the pack
 | |
| 	add	x9,x9,x10,lsl#32
 | |
| 	add	x11,x11,x12,lsl#32
 | |
| 	ldp	x10,x12,[x1,#16]
 | |
| 	add	x13,x13,x14,lsl#32
 | |
| 	add	x15,x15,x16,lsl#32
 | |
| 	ldp	x14,x16,[x1,#32]
 | |
| 	add	x17,x17,x19,lsl#32
 | |
| 	add	x20,x20,x21,lsl#32
 | |
| 	ldp	x19,x21,[x1,#48]
 | |
| 	add	x1,x1,#64		// advance input pointer past the 64 bytes just loaded
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev	x5,x5		// big-endian builds only: byte-reverse each packed 64-bit word
 | |
| 	rev	x7,x7
 | |
| 	rev	x9,x9
 | |
| 	rev	x11,x11
 | |
| 	rev	x13,x13
 | |
| 	rev	x15,x15
 | |
| 	rev	x17,x17
 | |
| 	rev	x20,x20
 | |
| #endif
 | |
| 	eor	x5,x5,x6		// XOR the eight packed words with the input words
 | |
| 	eor	x7,x7,x8
 | |
| 	eor	x9,x9,x10
 | |
| 	eor	x11,x11,x12
 | |
| 	eor	x13,x13,x14
 | |
| 	eor	x15,x15,x16
 | |
| 	eor	x17,x17,x19
 | |
| 	eor	x20,x20,x21
 | |
| 
 | |
| 	stp	x5,x7,[x0,#0]		// store output: 64 bytes to [x0], interleaved with the state reload below
 | |
| 	add	x28,x28,#1			// increment counter (64-bit add on the packed pair). NOTE(review): a wrap of the low 32-bit word would carry into the high word; presumably callers rule that out - confirm
 | |
| 	mov	w5,w22			// unpack key block: reset the scalar working registers from x22-x28,x30 for the next block
 | |
| 	lsr	x6,x22,#32
 | |
| 	stp	x9,x11,[x0,#16]
 | |
| 	mov	w7,w23
 | |
| 	lsr	x8,x23,#32
 | |
| 	stp	x13,x15,[x0,#32]
 | |
| 	mov	w9,w24
 | |
| 	lsr	x10,x24,#32
 | |
| 	stp	x17,x20,[x0,#48]
 | |
| 	add	x0,x0,#64		// advance output pointer past the 64 bytes just stored
 | |
| 	mov	w11,w25
 | |
| 	lsr	x12,x25,#32
 | |
| 	mov	w13,w26
 | |
| 	lsr	x14,x26,#32
 | |
| 	mov	w15,w27
 | |
| 	lsr	x16,x27,#32
 | |
| 	mov	w17,w28		// low word of x28 already includes the +1 applied above
 | |
| 	lsr	x19,x28,#32
 | |
| 	mov	w20,w30
 | |
| 	lsr	x21,x30,#32
 | |
| 
 | |
| 	mov	x4,#5		// iteration count for Loop_lower_neon (decremented at its top, tested by cbnz at its bottom)
 | |
| Loop_lower_neon:		// round loop: six NEON blocks (rows a/b/c/d = v0-v3, v4-v7, ..., v20-v23; v24-v29 scratch) interleaved with one scalar block (a=w5-w8, b=w9-w12, c=w13-w16, d=w17,w19,w20,w21). Per iteration the scalar lane runs two column+diagonal round pairs, the NEON lanes one.
 | |
| 	sub	x4,x4,#1		// x4 = iterations left; no flags set, tested by the cbnz at the bottom
 | |
| 	add	v0.4s,v0.4s,v1.4s		// NEON column round: a += b (all six blocks)
 | |
| 	add	w5,w5,w9		// scalar column round: a += b
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	add	w6,w6,w10
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	add	w7,w7,w11
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	add	w8,w8,w12
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	eor	w17,w17,w5		// scalar: d ^= a
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	eor	w19,w19,w6
 | |
| 	eor	v3.16b,v3.16b,v0.16b		// NEON: d ^= a
 | |
| 	eor	w20,w20,w7
 | |
| 	eor	v7.16b,v7.16b,v4.16b
 | |
| 	eor	w21,w21,w8
 | |
| 	eor	v11.16b,v11.16b,v8.16b
 | |
| 	ror	w17,w17,#16		// scalar: d = rotl(d,16)
 | |
| 	eor	v15.16b,v15.16b,v12.16b
 | |
| 	ror	w19,w19,#16
 | |
| 	eor	v19.16b,v19.16b,v16.16b
 | |
| 	ror	w20,w20,#16
 | |
| 	eor	v23.16b,v23.16b,v20.16b
 | |
| 	ror	w21,w21,#16
 | |
| 	rev32	v3.8h,v3.8h		// NEON: d = rotl(d,16), done as a halfword swap inside each 32-bit lane
 | |
| 	add	w13,w13,w17		// scalar: c += d
 | |
| 	rev32	v7.8h,v7.8h
 | |
| 	add	w14,w14,w19
 | |
| 	rev32	v11.8h,v11.8h
 | |
| 	add	w15,w15,w20
 | |
| 	rev32	v15.8h,v15.8h
 | |
| 	add	w16,w16,w21
 | |
| 	rev32	v19.8h,v19.8h
 | |
| 	eor	w9,w9,w13		// scalar: b ^= c
 | |
| 	rev32	v23.8h,v23.8h
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v2.4s,v2.4s,v3.4s		// NEON: c += d
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w9,w9,#20		// scalar: b = rotl(b,12), written as ror #20
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b		// NEON: scratch = b ^ c (one scratch register per block, v24-v29)
 | |
| 	add	w5,w5,w9		// scalar: a += b
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w10
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w11
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w12
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w17,w17,w5		// scalar: d ^= a
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w19,w19,w6
 | |
| 	ushr	v1.4s,v24.4s,#20		// NEON: b = scratch >> 20 ...
 | |
| 	eor	w20,w20,w7
 | |
| 	ushr	v5.4s,v25.4s,#20
 | |
| 	eor	w21,w21,w8
 | |
| 	ushr	v9.4s,v26.4s,#20
 | |
| 	ror	w17,w17,#24		// scalar: d = rotl(d,8), written as ror #24
 | |
| 	ushr	v13.4s,v27.4s,#20
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v17.4s,v28.4s,#20
 | |
| 	ror	w20,w20,#24
 | |
| 	ushr	v21.4s,v29.4s,#20
 | |
| 	ror	w21,w21,#24
 | |
| 	sli	v1.4s,v24.4s,#12		// ... with scratch << 12 inserted: b = rotl(b ^ c,12)
 | |
| 	add	w13,w13,w17		// scalar: c += d
 | |
| 	sli	v5.4s,v25.4s,#12
 | |
| 	add	w14,w14,w19
 | |
| 	sli	v9.4s,v26.4s,#12
 | |
| 	add	w15,w15,w20
 | |
| 	sli	v13.4s,v27.4s,#12
 | |
| 	add	w16,w16,w21
 | |
| 	sli	v17.4s,v28.4s,#12
 | |
| 	eor	w9,w9,w13		// scalar: b ^= c
 | |
| 	sli	v21.4s,v29.4s,#12
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v0.4s,v0.4s,v1.4s		// NEON: a += b
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	ror	w9,w9,#25		// scalar: b = rotl(b,7), written as ror #25; scalar column round done
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	ror	w10,w10,#25
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	ror	w11,w11,#25
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	ror	w12,w12,#25
 | |
| 	eor	v24.16b,v3.16b,v0.16b		// NEON: scratch = d ^ a
 | |
| 	add	w5,w5,w10		// scalar diagonal round: a += b with b taken one position over (w10,w11,w12,w9)
 | |
| 	eor	v25.16b,v7.16b,v4.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v11.16b,v8.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v15.16b,v12.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v19.16b,v16.16b
 | |
| 	eor	w21,w21,w5		// scalar: d ^= a with d taken three positions over (w21,w17,w19,w20)
 | |
| 	eor	v29.16b,v23.16b,v20.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v3.4s,v24.4s,#24		// NEON: d = rotl(d ^ a,8) as ushr #24 + sli #8
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v7.4s,v25.4s,#24
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v11.4s,v26.4s,#24
 | |
| 	ror	w21,w21,#16		// scalar: d = rotl(d,16)
 | |
| 	ushr	v15.4s,v27.4s,#24
 | |
| 	ror	w17,w17,#16
 | |
| 	ushr	v19.4s,v28.4s,#24
 | |
| 	ror	w19,w19,#16
 | |
| 	ushr	v23.4s,v29.4s,#24
 | |
| 	ror	w20,w20,#16
 | |
| 	sli	v3.4s,v24.4s,#8
 | |
| 	add	w15,w15,w21		// scalar: c += d with c taken two positions over (w15,w16,w13,w14)
 | |
| 	sli	v7.4s,v25.4s,#8
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v11.4s,v26.4s,#8
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v15.4s,v27.4s,#8
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v19.4s,v28.4s,#8
 | |
| 	eor	w10,w10,w15		// scalar: b ^= c
 | |
| 	sli	v23.4s,v29.4s,#8
 | |
| 	eor	w11,w11,w16
 | |
| 	add	v2.4s,v2.4s,v3.4s		// NEON: c += d
 | |
| 	eor	w12,w12,w13
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w9,w9,w14
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w10,w10,#20		// scalar: b = rotl(b,12)
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b		// NEON: scratch = b ^ c
 | |
| 	add	w5,w5,w10		// scalar: a += b
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w21,w21,w5		// scalar: d ^= a
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v1.4s,v24.4s,#25		// NEON: b = rotl(b ^ c,7) as ushr #25 + sli #7
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v5.4s,v25.4s,#25
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v9.4s,v26.4s,#25
 | |
| 	ror	w21,w21,#24		// scalar: d = rotl(d,8)
 | |
| 	ushr	v13.4s,v27.4s,#25
 | |
| 	ror	w17,w17,#24
 | |
| 	ushr	v17.4s,v28.4s,#25
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v21.4s,v29.4s,#25
 | |
| 	ror	w20,w20,#24
 | |
| 	sli	v1.4s,v24.4s,#7
 | |
| 	add	w15,w15,w21		// scalar: c += d
 | |
| 	sli	v5.4s,v25.4s,#7
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v9.4s,v26.4s,#7
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v13.4s,v27.4s,#7
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v17.4s,v28.4s,#7
 | |
| 	eor	w10,w10,w15		// scalar: b ^= c
 | |
| 	sli	v21.4s,v29.4s,#7
 | |
| 	eor	w11,w11,w16
 | |
| 	ext	v2.16b,v2.16b,v2.16b,#8		// NEON: rotate row c by two 32-bit lanes
 | |
| 	eor	w12,w12,w13
 | |
| 	ext	v6.16b,v6.16b,v6.16b,#8
 | |
| 	eor	w9,w9,w14
 | |
| 	ext	v10.16b,v10.16b,v10.16b,#8
 | |
| 	ror	w10,w10,#25		// scalar: b = rotl(b,7); first scalar column+diagonal pair of this iteration done
 | |
| 	ext	v14.16b,v14.16b,v14.16b,#8
 | |
| 	ror	w11,w11,#25
 | |
| 	ext	v18.16b,v18.16b,v18.16b,#8
 | |
| 	ror	w12,w12,#25
 | |
| 	ext	v22.16b,v22.16b,v22.16b,#8
 | |
| 	ror	w9,w9,#25
 | |
| 	ext	v3.16b,v3.16b,v3.16b,#12		// NEON: row d, lane i takes old lane (i+3) mod 4
 | |
| 	ext	v7.16b,v7.16b,v7.16b,#12
 | |
| 	ext	v11.16b,v11.16b,v11.16b,#12
 | |
| 	ext	v15.16b,v15.16b,v15.16b,#12
 | |
| 	ext	v19.16b,v19.16b,v19.16b,#12
 | |
| 	ext	v23.16b,v23.16b,v23.16b,#12
 | |
| 	ext	v1.16b,v1.16b,v1.16b,#4		// NEON: row b, lane i takes old lane (i+1) mod 4; rows now line up along the diagonals
 | |
| 	ext	v5.16b,v5.16b,v5.16b,#4
 | |
| 	ext	v9.16b,v9.16b,v9.16b,#4
 | |
| 	ext	v13.16b,v13.16b,v13.16b,#4
 | |
| 	ext	v17.16b,v17.16b,v17.16b,#4
 | |
| 	ext	v21.16b,v21.16b,v21.16b,#4
 | |
| 	add	v0.4s,v0.4s,v1.4s		// NEON diagonal round: same a/b/c/d sequence as the first half, on the lane-rotated rows
 | |
| 	add	w5,w5,w9		// scalar: second column round of this iteration
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	add	w6,w6,w10
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	add	w7,w7,w11
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	add	w8,w8,w12
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	eor	w17,w17,w5
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	eor	w19,w19,w6
 | |
| 	eor	v3.16b,v3.16b,v0.16b
 | |
| 	eor	w20,w20,w7
 | |
| 	eor	v7.16b,v7.16b,v4.16b
 | |
| 	eor	w21,w21,w8
 | |
| 	eor	v11.16b,v11.16b,v8.16b
 | |
| 	ror	w17,w17,#16
 | |
| 	eor	v15.16b,v15.16b,v12.16b
 | |
| 	ror	w19,w19,#16
 | |
| 	eor	v19.16b,v19.16b,v16.16b
 | |
| 	ror	w20,w20,#16
 | |
| 	eor	v23.16b,v23.16b,v20.16b
 | |
| 	ror	w21,w21,#16
 | |
| 	rev32	v3.8h,v3.8h
 | |
| 	add	w13,w13,w17
 | |
| 	rev32	v7.8h,v7.8h
 | |
| 	add	w14,w14,w19
 | |
| 	rev32	v11.8h,v11.8h
 | |
| 	add	w15,w15,w20
 | |
| 	rev32	v15.8h,v15.8h
 | |
| 	add	w16,w16,w21
 | |
| 	rev32	v19.8h,v19.8h
 | |
| 	eor	w9,w9,w13
 | |
| 	rev32	v23.8h,v23.8h
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b
 | |
| 	add	w5,w5,w9
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w10
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w11
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w12
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w17,w17,w5
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w19,w19,w6
 | |
| 	ushr	v1.4s,v24.4s,#20
 | |
| 	eor	w20,w20,w7
 | |
| 	ushr	v5.4s,v25.4s,#20
 | |
| 	eor	w21,w21,w8
 | |
| 	ushr	v9.4s,v26.4s,#20
 | |
| 	ror	w17,w17,#24
 | |
| 	ushr	v13.4s,v27.4s,#20
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v17.4s,v28.4s,#20
 | |
| 	ror	w20,w20,#24
 | |
| 	ushr	v21.4s,v29.4s,#20
 | |
| 	ror	w21,w21,#24
 | |
| 	sli	v1.4s,v24.4s,#12
 | |
| 	add	w13,w13,w17
 | |
| 	sli	v5.4s,v25.4s,#12
 | |
| 	add	w14,w14,w19
 | |
| 	sli	v9.4s,v26.4s,#12
 | |
| 	add	w15,w15,w20
 | |
| 	sli	v13.4s,v27.4s,#12
 | |
| 	add	w16,w16,w21
 | |
| 	sli	v17.4s,v28.4s,#12
 | |
| 	eor	w9,w9,w13
 | |
| 	sli	v21.4s,v29.4s,#12
 | |
| 	eor	w10,w10,w14
 | |
| 	add	v0.4s,v0.4s,v1.4s
 | |
| 	eor	w11,w11,w15
 | |
| 	add	v4.4s,v4.4s,v5.4s
 | |
| 	eor	w12,w12,w16
 | |
| 	add	v8.4s,v8.4s,v9.4s
 | |
| 	ror	w9,w9,#25
 | |
| 	add	v12.4s,v12.4s,v13.4s
 | |
| 	ror	w10,w10,#25
 | |
| 	add	v16.4s,v16.4s,v17.4s
 | |
| 	ror	w11,w11,#25
 | |
| 	add	v20.4s,v20.4s,v21.4s
 | |
| 	ror	w12,w12,#25
 | |
| 	eor	v24.16b,v3.16b,v0.16b
 | |
| 	add	w5,w5,w10		// scalar: second diagonal round of this iteration
 | |
| 	eor	v25.16b,v7.16b,v4.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v11.16b,v8.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v15.16b,v12.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v19.16b,v16.16b
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	v29.16b,v23.16b,v20.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v3.4s,v24.4s,#24
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v7.4s,v25.4s,#24
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v11.4s,v26.4s,#24
 | |
| 	ror	w21,w21,#16
 | |
| 	ushr	v15.4s,v27.4s,#24
 | |
| 	ror	w17,w17,#16
 | |
| 	ushr	v19.4s,v28.4s,#24
 | |
| 	ror	w19,w19,#16
 | |
| 	ushr	v23.4s,v29.4s,#24
 | |
| 	ror	w20,w20,#16
 | |
| 	sli	v3.4s,v24.4s,#8
 | |
| 	add	w15,w15,w21
 | |
| 	sli	v7.4s,v25.4s,#8
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v11.4s,v26.4s,#8
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v15.4s,v27.4s,#8
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v19.4s,v28.4s,#8
 | |
| 	eor	w10,w10,w15
 | |
| 	sli	v23.4s,v29.4s,#8
 | |
| 	eor	w11,w11,w16
 | |
| 	add	v2.4s,v2.4s,v3.4s
 | |
| 	eor	w12,w12,w13
 | |
| 	add	v6.4s,v6.4s,v7.4s
 | |
| 	eor	w9,w9,w14
 | |
| 	add	v10.4s,v10.4s,v11.4s
 | |
| 	ror	w10,w10,#20
 | |
| 	add	v14.4s,v14.4s,v15.4s
 | |
| 	ror	w11,w11,#20
 | |
| 	add	v18.4s,v18.4s,v19.4s
 | |
| 	ror	w12,w12,#20
 | |
| 	add	v22.4s,v22.4s,v23.4s
 | |
| 	ror	w9,w9,#20
 | |
| 	eor	v24.16b,v1.16b,v2.16b
 | |
| 	add	w5,w5,w10
 | |
| 	eor	v25.16b,v5.16b,v6.16b
 | |
| 	add	w6,w6,w11
 | |
| 	eor	v26.16b,v9.16b,v10.16b
 | |
| 	add	w7,w7,w12
 | |
| 	eor	v27.16b,v13.16b,v14.16b
 | |
| 	add	w8,w8,w9
 | |
| 	eor	v28.16b,v17.16b,v18.16b
 | |
| 	eor	w21,w21,w5
 | |
| 	eor	v29.16b,v21.16b,v22.16b
 | |
| 	eor	w17,w17,w6
 | |
| 	ushr	v1.4s,v24.4s,#25
 | |
| 	eor	w19,w19,w7
 | |
| 	ushr	v5.4s,v25.4s,#25
 | |
| 	eor	w20,w20,w8
 | |
| 	ushr	v9.4s,v26.4s,#25
 | |
| 	ror	w21,w21,#24
 | |
| 	ushr	v13.4s,v27.4s,#25
 | |
| 	ror	w17,w17,#24
 | |
| 	ushr	v17.4s,v28.4s,#25
 | |
| 	ror	w19,w19,#24
 | |
| 	ushr	v21.4s,v29.4s,#25
 | |
| 	ror	w20,w20,#24
 | |
| 	sli	v1.4s,v24.4s,#7
 | |
| 	add	w15,w15,w21
 | |
| 	sli	v5.4s,v25.4s,#7
 | |
| 	add	w16,w16,w17
 | |
| 	sli	v9.4s,v26.4s,#7
 | |
| 	add	w13,w13,w19
 | |
| 	sli	v13.4s,v27.4s,#7
 | |
| 	add	w14,w14,w20
 | |
| 	sli	v17.4s,v28.4s,#7
 | |
| 	eor	w10,w10,w15
 | |
| 	sli	v21.4s,v29.4s,#7
 | |
| 	eor	w11,w11,w16
 | |
| 	ext	v2.16b,v2.16b,v2.16b,#8		// NEON: rotate row c by two lanes again, restoring its original lane order
 | |
| 	eor	w12,w12,w13
 | |
| 	ext	v6.16b,v6.16b,v6.16b,#8
 | |
| 	eor	w9,w9,w14
 | |
| 	ext	v10.16b,v10.16b,v10.16b,#8
 | |
| 	ror	w10,w10,#25
 | |
| 	ext	v14.16b,v14.16b,v14.16b,#8
 | |
| 	ror	w11,w11,#25
 | |
| 	ext	v18.16b,v18.16b,v18.16b,#8
 | |
| 	ror	w12,w12,#25
 | |
| 	ext	v22.16b,v22.16b,v22.16b,#8
 | |
| 	ror	w9,w9,#25
 | |
| 	ext	v3.16b,v3.16b,v3.16b,#4		// NEON: row d, ext #4 undoes the earlier ext #12
 | |
| 	ext	v7.16b,v7.16b,v7.16b,#4
 | |
| 	ext	v11.16b,v11.16b,v11.16b,#4
 | |
| 	ext	v15.16b,v15.16b,v15.16b,#4
 | |
| 	ext	v19.16b,v19.16b,v19.16b,#4
 | |
| 	ext	v23.16b,v23.16b,v23.16b,#4
 | |
| 	ext	v1.16b,v1.16b,v1.16b,#12		// NEON: row b, ext #12 undoes the earlier ext #4; rows are back in column order
 | |
| 	ext	v5.16b,v5.16b,v5.16b,#12
 | |
| 	ext	v9.16b,v9.16b,v9.16b,#12
 | |
| 	ext	v13.16b,v13.16b,v13.16b,#12
 | |
| 	ext	v17.16b,v17.16b,v17.16b,#12
 | |
| 	ext	v21.16b,v21.16b,v21.16b,#12
 | |
| 	cbnz	x4,Loop_lower_neon		// repeat until the counter decremented at the top reaches zero
 | |
| 
 | |
| 	add	w5,w5,w22		// accumulate key block: scalar lane adds the packed input state (x22-x28,x30) back in
 | |
| 	ldp	q24,q25,[sp,#0]		// reload state rows from the stack off-load area (v24-v29 were scratch in the round loop)
 | |
| 	add	x6,x6,x22,lsr#32
 | |
| 	ldp	q26,q27,[sp,#32]
 | |
| 	add	w7,w7,w23
 | |
| 	ldp	q28,q29,[sp,#64]
 | |
| 	add	x8,x8,x23,lsr#32
 | |
| 	add	v0.4s,v0.4s,v24.4s		// NEON: row a of every block (v0,v4,...,v20) += v24
 | |
| 	add	w9,w9,w24
 | |
| 	add	v4.4s,v4.4s,v24.4s
 | |
| 	add	x10,x10,x24,lsr#32
 | |
| 	add	v8.4s,v8.4s,v24.4s
 | |
| 	add	w11,w11,w25
 | |
| 	add	v12.4s,v12.4s,v24.4s
 | |
| 	add	x12,x12,x25,lsr#32
 | |
| 	add	v16.4s,v16.4s,v24.4s
 | |
| 	add	w13,w13,w26
 | |
| 	add	v20.4s,v20.4s,v24.4s
 | |
| 	add	x14,x14,x26,lsr#32
 | |
| 	add	v2.4s,v2.4s,v26.4s		// NEON: row c of every block (v2,v6,...,v22) += v26
 | |
| 	add	w15,w15,w27
 | |
| 	add	v6.4s,v6.4s,v26.4s
 | |
| 	add	x16,x16,x27,lsr#32
 | |
| 	add	v10.4s,v10.4s,v26.4s
 | |
| 	add	w17,w17,w28
 | |
| 	add	v14.4s,v14.4s,v26.4s
 | |
| 	add	x19,x19,x28,lsr#32
 | |
| 	add	v18.4s,v18.4s,v26.4s
 | |
| 	add	w20,w20,w30
 | |
| 	add	v22.4s,v22.4s,v26.4s
 | |
| 	add	x21,x21,x30,lsr#32
 | |
| 	add	v19.4s,v19.4s,v31.4s			// +4: row d of blocks 5 and 6 reuses v27/v28 (added below) plus v31
 | |
| 	add	x5,x5,x6,lsl#32	// pack: x5 = w5 | (x6 << 32); the w-register add above left x5's upper half zero
 | |
| 	add	v23.4s,v23.4s,v31.4s			// +4
 | |
| 	add	x7,x7,x8,lsl#32
 | |
| 	add	v3.4s,v3.4s,v27.4s		// NEON: row d, one addend per block (v27,v28,v29,v30, then v27 and v28 again)
 | |
| 	ldp	x6,x8,[x1,#0]		// load input: 64 bytes at [x1] for the scalar block
 | |
| 	add	v7.4s,v7.4s,v28.4s
 | |
| 	add	x9,x9,x10,lsl#32
 | |
| 	add	v11.4s,v11.4s,v29.4s
 | |
| 	add	x11,x11,x12,lsl#32
 | |
| 	add	v15.4s,v15.4s,v30.4s
 | |
| 	ldp	x10,x12,[x1,#16]
 | |
| 	add	v19.4s,v19.4s,v27.4s
 | |
| 	add	x13,x13,x14,lsl#32
 | |
| 	add	v23.4s,v23.4s,v28.4s
 | |
| 	add	x15,x15,x16,lsl#32
 | |
| 	add	v1.4s,v1.4s,v25.4s		// NEON: row b of every block (v1,v5,...,v21) += v25
 | |
| 	ldp	x14,x16,[x1,#32]
 | |
| 	add	v5.4s,v5.4s,v25.4s
 | |
| 	add	x17,x17,x19,lsl#32
 | |
| 	add	v9.4s,v9.4s,v25.4s
 | |
| 	add	x20,x20,x21,lsl#32
 | |
| 	add	v13.4s,v13.4s,v25.4s
 | |
| 	ldp	x19,x21,[x1,#48]
 | |
| 	add	v17.4s,v17.4s,v25.4s
 | |
| 	add	x1,x1,#64		// advance input pointer past the scalar block's 64 bytes
 | |
| 	add	v21.4s,v21.4s,v25.4s
 | |
| 
 | |
| #ifdef	__AARCH64EB__
 | |
| 	rev	x5,x5		// big-endian builds only: byte-reverse each packed 64-bit word
 | |
| 	rev	x7,x7
 | |
| 	rev	x9,x9
 | |
| 	rev	x11,x11
 | |
| 	rev	x13,x13
 | |
| 	rev	x15,x15
 | |
| 	rev	x17,x17
 | |
| 	rev	x20,x20
 | |
| #endif
 | |
| 	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64		// next 64 input bytes (for NEON block 1); overwrites the v24-v27 state copies
 | |
| 	eor	x5,x5,x6		// scalar block: XOR packed words with input
 | |
| 	eor	x7,x7,x8
 | |
| 	eor	x9,x9,x10
 | |
| 	eor	x11,x11,x12
 | |
| 	eor	x13,x13,x14
 | |
| 	eor	v0.16b,v0.16b,v24.16b		// NEON block 1: XOR with input
 | |
| 	eor	x15,x15,x16
 | |
| 	eor	v1.16b,v1.16b,v25.16b
 | |
| 	eor	x17,x17,x19
 | |
| 	eor	v2.16b,v2.16b,v26.16b
 | |
| 	eor	x20,x20,x21
 | |
| 	eor	v3.16b,v3.16b,v27.16b
 | |
| 	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64		// input for NEON block 2
 | |
| 
 | |
| 	stp	x5,x7,[x0,#0]		// store output: scalar block, 64 bytes at [x0]
 | |
| 	add	x28,x28,#7			// increment counter: +7 here on top of the +1 after the first scalar block, matching the += 8 applied to v27-v30 below
 | |
| 	stp	x9,x11,[x0,#16]
 | |
| 	stp	x13,x15,[x0,#32]
 | |
| 	stp	x17,x20,[x0,#48]
 | |
| 	add	x0,x0,#64		// advance output pointer past the scalar block
 | |
| 	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64		// store NEON block 1; v0-v3 become free for input staging
 | |
| 
 | |
| 	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64		// input for NEON block 3
 | |
| 	eor	v4.16b,v4.16b,v24.16b
 | |
| 	eor	v5.16b,v5.16b,v25.16b
 | |
| 	eor	v6.16b,v6.16b,v26.16b
 | |
| 	eor	v7.16b,v7.16b,v27.16b
 | |
| 	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64		// store NEON block 2
 | |
| 
 | |
| 	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64		// input for NEON block 4
 | |
| 	eor	v8.16b,v8.16b,v0.16b
 | |
| 	ldp	q24,q25,[sp,#0]		// v24-v27 held input data above; restore them from the stack copy
 | |
| 	eor	v9.16b,v9.16b,v1.16b
 | |
| 	ldp	q26,q27,[sp,#32]
 | |
| 	eor	v10.16b,v10.16b,v2.16b
 | |
| 	eor	v11.16b,v11.16b,v3.16b
 | |
| 	st1	{v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64		// store NEON block 3
 | |
| 
 | |
| 	ld1	{v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64		// input for NEON block 5
 | |
| 	eor	v12.16b,v12.16b,v4.16b
 | |
| 	eor	v13.16b,v13.16b,v5.16b
 | |
| 	eor	v14.16b,v14.16b,v6.16b
 | |
| 	eor	v15.16b,v15.16b,v7.16b
 | |
| 	st1	{v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64		// store NEON block 4
 | |
| 
 | |
| 	ld1	{v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64		// input for NEON block 6
 | |
| 	eor	v16.16b,v16.16b,v8.16b
 | |
| 	eor	v17.16b,v17.16b,v9.16b
 | |
| 	eor	v18.16b,v18.16b,v10.16b
 | |
| 	eor	v19.16b,v19.16b,v11.16b
 | |
| 	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64		// store NEON block 5
 | |
| 
 | |
| 	shl	v0.4s,v31.4s,#1			// 4 -> 8: v0 = v31 << 1, used as the per-iteration counter step
 | |
| 	eor	v20.16b,v20.16b,v12.16b
 | |
| 	eor	v21.16b,v21.16b,v13.16b
 | |
| 	eor	v22.16b,v22.16b,v14.16b
 | |
| 	eor	v23.16b,v23.16b,v15.16b
 | |
| 	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64		// store NEON block 6
 | |
| 
 | |
| 	add	v27.4s,v27.4s,v0.4s			// += 8: advance the four row-d addends for the next outer iteration
 | |
| 	add	v28.4s,v28.4s,v0.4s
 | |
| 	add	v29.4s,v29.4s,v0.4s
 | |
| 	add	v30.4s,v30.4s,v0.4s
 | |
| 
 | |
| 	b.hs	Loop_outer_512_neon		// NOTE(review): nothing between the round loops and here writes NZCV, so this consumes flags set earlier - presumably a subs on x2 at the loop top; confirm
 | |
| 
 | |
| 	adds	x2,x2,#512		// add 512 back to x2 (presumably undoing the loop-top subtraction - confirm); Z set when the result is zero
 | |
| 	ushr	v0.4s,v31.4s,#2			// 4 -> 1: v0 = v31 >> 2
 | |
| 
 | |
| 	ldp	d8,d9,[sp,#128+0]		// meet ABI requirements: restore callee-saved d8-d15 from sp+128..sp+191
 | |
| 	ldp	d10,d11,[sp,#128+16]
 | |
| 	ldp	d12,d13,[sp,#128+32]
 | |
| 	ldp	d14,d15,[sp,#128+48]
 | |
| 
 | |
| 	stp	q24,q31,[sp,#0]		// wipe off-load area: overwrite the 96 bytes at sp..sp+95 with the contents of v24 and v31
 | |
| 	stp	q24,q31,[sp,#32]
 | |
| 	stp	q24,q31,[sp,#64]
 | |
| 
 | |
| 	b.eq	Ldone_512_neon		// flags still from the adds above (the ushr/ldp/stp in between do not write NZCV): x2 == 0, finished
 | |
| 
 | |
| 	cmp	x2,#192		// is x2 still at least 192? (decides the branch below)
 | |
| 	sub	v27.4s,v27.4s,v0.4s			// -= 1: step v27-v29 back by one (v30 is left as is)
 | |
| 	sub	v28.4s,v28.4s,v0.4s
 | |
| 	sub	v29.4s,v29.4s,v0.4s
 | |
| 	add	sp,sp,#128		// release 128 bytes of stack; the Ldone_512_neon path instead pops 128+64
 | |
| 	b.hs	Loop_outer_neon		// x2 >= 192 (unsigned): continue in Loop_outer_neon
 | |
| 
 | |
| 	eor	v25.16b,v25.16b,v25.16b		// x2 < 192: zero v25-v30 before handing over to Loop_outer
 | |
| 	eor	v26.16b,v26.16b,v26.16b
 | |
| 	eor	v27.16b,v27.16b,v27.16b
 | |
| 	eor	v28.16b,v28.16b,v28.16b
 | |
| 	eor	v29.16b,v29.16b,v29.16b
 | |
| 	eor	v30.16b,v30.16b,v30.16b
 | |
| 	b	Loop_outer
 | |
| 
 | |
| Ldone_512_neon:		// exit path taken when x2 reached zero: restore callee-saved registers, unwind the stack, return
 | |
| 	ldp	x19,x20,[x29,#16]		// reload x19-x28 from the frame at x29+16..x29+95 (presumably where the prologue saved them - confirm)
 | |
| 	add	sp,sp,#128+64		// pop the 128-byte off-load area plus the 64 bytes above it that held d8-d15
 | |
| 	ldp	x21,x22,[x29,#32]
 | |
| 	ldp	x23,x24,[x29,#48]
 | |
| 	ldp	x25,x26,[x29,#64]
 | |
| 	ldp	x27,x28,[x29,#80]
 | |
| 	ldp	x29,x30,[sp],#96		// restore frame pointer and link register, then pop the 96-byte frame (post-index)
 | |
| 	AARCH64_VALIDATE_LINK_REGISTER		// macro from <openssl/arm_arch.h>; presumably the counterpart of AARCH64_SIGN_LINK_REGISTER - confirm
 | |
| 	ret
 | |
| 
 | |
| #endif  // !OPENSSL_NO_ASM
 |