// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif
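// (Reading aid, not part of the generated output: MemorySanitizer cannot
// see into uninstrumented assembly, so the asm path is disabled under
// MSan to avoid false "uninitialized read" reports.)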

#if !defined(OPENSSL_NO_ASM)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>

.text

.globl	_bn_mul_mont
.private_extern	_bn_mul_mont
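// Montgomery multiplication. Apparent argument mapping, judging from the
// loads below (a reading aid, not authoritative for the generated file):
//	x0	rp	result
//	x1	ap	first operand
//	x2	bp	second operand
//	x3	np	modulus
//	x4	&n0	pointer to n0, assumed to be the usual Montgomery
//			constant -np[0]^-1 mod 2^64
//	x5	num	number of 64-bit limbs
// num divisible by 8 dispatches to __bn_sqr8x_mont, num divisible by 4
// to __bn_mul4x_mont; everything else takes the generic path below.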

.align	5
_bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont
Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On removal of the first multiplication and addition:
	//	the outcome of the first addition is guaranteed to be
	//	zero, which leaves two computationally significant
	//	outcomes: it either carries or it doesn't. So when does
	//	it carry? Is there an alternative way to deduce it? If
	//	you follow the operations, you can observe that the
	//	condition for carry is quite simple: x6 being non-zero.
	//	So the carry can be calculated by adding -1 to x6,
	//	which is what the next instruction does.
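	//	In other words: the discarded adds computes
	//	x6+lo(np[0]*m1), which is 0 mod 2^64 by construction of
	//	m1, so it carries out exactly when x6!=0; and
	//	subs xzr,x6,#1 sets C (AArch64 "no borrow") precisely
	//	when x6>=1, reproducing that carry.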
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,L1st_skip

L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,L1st

L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,Linner_skip

Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,Linner

Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,Louter

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
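	// Note the conditional copy below is done with csel keyed on
	// the borrow flag ("lo") rather than with a branch, so no
	// branch depends on whether the result needed reducing.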
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret


.align	5
__bn_sqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
Lsqr8x_mont:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	Lsqr8x_zero_start

Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
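        // The triangle above lists each cross product a[j]*a[i] (j>i)
        // exactly once; the doubling of these terms and the diagonal
        // squares a[i]*a[i] are added in later, starting at
        // Lsqr8x_outer_break.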

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
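	// Each pass of Lsqr8x_mul below multiplies the eight cached a[]
	// words by one earlier a word (x4), accumulating into the
	// eight-limb window; x27 counts upward from -64 in steps of 8
	// and also indexes the next multiplier word at [x0,x27].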
Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_mul

.align	4
Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	Lsqr8x_outer_loop

.align	4
Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
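	// The doubling is folded into the loop below:
	// extr xD,xN,xM,#63 yields (xN<<1)|(xM>>63), i.e. xN doubled
	// with the previous word's top bit carried in, while the adcs
	// chain adds in the a[i]*a[i] squares.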
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
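	// Each of the eight passes of Lsqr8x_reduction retires one
	// limb: x28 = t[0]*n0 makes t[0]+lo(n[0]*x28) vanish mod 2^64,
	// the remaining products n[0..7]*x28 are folded into the
	// eight-limb window, and x28 is stashed for the tail loops.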
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
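	// Same carry trick as in Lmul_mont: lo(n[0]*x28) cancels x19
	// mod 2^64, so the discarded addition carries out exactly when
	// x19 is non-zero.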
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	Lsqr8x_tail

.align	4
Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
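	// x30 is 0 or 1 here, so this sets C exactly when the top-most
	// carry was 1, feeding it into the adcs chain below.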
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	Lsqr8x_reduction

	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	Lsqr8x_done

.align	4
Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret


.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont or __bn_sqr8x_mont, which have already signed the
	// return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

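	// First pass over b[0..3]: multiplication by b[i] is interleaved
	// with the reduction. x28 steps through 8,16,24,0 (mod 32),
	// indexing both the next b word at [x2,x28] and, in the tail
	// loops, the stashed t[0]*n0 values at [sp,x28].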
Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_reduction

	cbz	x10,Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_1st_tail

.align	5
Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	Loop_mul4x_tail

.align	4
Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	Loop_mul4x_reduction

.align	4
Lmul4x_post:
	// Final step. We see if the result is larger than the modulus,
	// and if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract the modulus, see if it borrowed,
	// and conditionally copy the original value.
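	// Same constant-time tail as Lmul_mont: the accumulated
	// top-most carry in x30 is folded into the borrow via sbcs, and
	// csel on "lo" restores the unreduced value if the subtraction
	// underflowed.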
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	Lmul4x_done

.align	4
Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is popped earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
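// (The .byte string above encodes the NUL-terminated identification
// string "Montgomery Multiplication for ARMv8, CRYPTOGAMS by
// <appro@openssl.org>".)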
.align	2
.align	4
#endif  // !OPENSSL_NO_ASM