321 lines
7.6 KiB
ArmAsm
321 lines
7.6 KiB
ArmAsm
// This file is generated from a similarly-named Perl script in the BoringSSL
|
|
// source tree. Do not edit by hand.
|
|
|
|
#if !defined(__has_feature)
|
|
#define __has_feature(x) 0
|
|
#endif
|
|
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
|
|
#define OPENSSL_NO_ASM
|
|
#endif
|
|
|
|
#if !defined(OPENSSL_NO_ASM)
|
|
#if defined(__aarch64__)
|
|
#if defined(BORINGSSL_PREFIX)
|
|
#include <boringssl_prefix_symbols_asm.h>
|
|
#endif
|
|
#include "openssl/arm_arch.h"
|
|
|
|
.text
|
|
.globl beeu_mod_inverse_vartime
|
|
.hidden beeu_mod_inverse_vartime
|
|
.type beeu_mod_inverse_vartime, %function
|
|
.align 4
|
|
beeu_mod_inverse_vartime:
|
|
// Reserve enough space for 14 8-byte registers on the stack
|
|
// in the first stp call for x29, x30.
|
|
// Then store the remaining callee-saved registers.
|
|
//
|
|
// | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 |
|
|
// ^ ^
|
|
// sp <------------------- 112 bytes ----------------> old sp
|
|
// x29 (FP)
|
|
//
|
|
AARCH64_SIGN_LINK_REGISTER
|
|
stp x29,x30,[sp,#-112]!
|
|
add x29,sp,#0
|
|
stp x19,x20,[sp,#16]
|
|
stp x21,x22,[sp,#32]
|
|
stp x23,x24,[sp,#48]
|
|
stp x25,x26,[sp,#64]
|
|
stp x27,x28,[sp,#80]
|
|
stp x0,x2,[sp,#96]
|
|
|
|
// B = b3..b0 := a
|
|
ldp x25,x26,[x1]
|
|
ldp x27,x28,[x1,#16]
|
|
|
|
// n3..n0 := n
|
|
// Note: the value of input params are changed in the following.
|
|
ldp x0,x1,[x2]
|
|
ldp x2,x30,[x2,#16]
|
|
|
|
// A = a3..a0 := n
|
|
mov x21, x0
|
|
mov x22, x1
|
|
mov x23, x2
|
|
mov x24, x30
|
|
|
|
// X = x4..x0 := 1
|
|
mov x3, #1
|
|
eor x4, x4, x4
|
|
eor x5, x5, x5
|
|
eor x6, x6, x6
|
|
eor x7, x7, x7
|
|
|
|
// Y = y4..y0 := 0
|
|
eor x8, x8, x8
|
|
eor x9, x9, x9
|
|
eor x10, x10, x10
|
|
eor x11, x11, x11
|
|
eor x12, x12, x12
|
|
|
|
.Lbeeu_loop:
|
|
// if B == 0, jump to .Lbeeu_loop_end
|
|
orr x14, x25, x26
|
|
orr x14, x14, x27
|
|
|
|
// reverse the bit order of x25. This is needed for clz after this macro
|
|
rbit x15, x25
|
|
|
|
orr x14, x14, x28
|
|
cbz x14,.Lbeeu_loop_end
|
|
|
|
|
|
// 0 < B < |n|,
|
|
// 0 < A <= |n|,
|
|
// (1) X*a == B (mod |n|),
|
|
// (2) (-1)*Y*a == A (mod |n|)
|
|
|
|
// Now divide B by the maximum possible power of two in the
|
|
// integers, and divide X by the same value mod |n|.
|
|
// When we're done, (1) still holds.
|
|
|
|
// shift := number of trailing 0s in x25
|
|
// ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO)
|
|
clz x13, x15
|
|
|
|
// If there is no shift, goto shift_A_Y
|
|
cbz x13, .Lbeeu_shift_A_Y
|
|
|
|
// Shift B right by "x13" bits
|
|
neg x14, x13
|
|
lsr x25, x25, x13
|
|
lsl x15, x26, x14
|
|
|
|
lsr x26, x26, x13
|
|
lsl x19, x27, x14
|
|
|
|
orr x25, x25, x15
|
|
|
|
lsr x27, x27, x13
|
|
lsl x20, x28, x14
|
|
|
|
orr x26, x26, x19
|
|
|
|
lsr x28, x28, x13
|
|
|
|
orr x27, x27, x20
|
|
|
|
|
|
// Shift X right by "x13" bits, adding n whenever X becomes odd.
|
|
// x13--;
|
|
// x14 := 0; needed in the addition to the most significant word in SHIFT1
|
|
eor x14, x14, x14
|
|
.Lbeeu_shift_loop_X:
|
|
tbz x3, #0, .Lshift1_0
|
|
adds x3, x3, x0
|
|
adcs x4, x4, x1
|
|
adcs x5, x5, x2
|
|
adcs x6, x6, x30
|
|
adc x7, x7, x14
|
|
.Lshift1_0:
|
|
// var0 := [var1|var0]<64..1>;
|
|
// i.e. concatenate var1 and var0,
|
|
// extract bits <64..1> from the resulting 128-bit value
|
|
// and put them in var0
|
|
extr x3, x4, x3, #1
|
|
extr x4, x5, x4, #1
|
|
extr x5, x6, x5, #1
|
|
extr x6, x7, x6, #1
|
|
lsr x7, x7, #1
|
|
|
|
subs x13, x13, #1
|
|
bne .Lbeeu_shift_loop_X
|
|
|
|
// Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl
|
|
// with the following differences:
|
|
// - "x13" is set directly to the number of trailing 0s in B
|
|
// (using rbit and clz instructions)
|
|
// - The loop is only used to call SHIFT1(X)
|
|
// and x13 is decreased while executing the X loop.
|
|
// - SHIFT256(B, x13) is performed before right-shifting X; they are independent
|
|
|
|
.Lbeeu_shift_A_Y:
|
|
// Same for A and Y.
|
|
// Afterwards, (2) still holds.
|
|
// Reverse the bit order of x21
|
|
// x13 := number of trailing 0s in x21 (= number of leading 0s in x15)
|
|
rbit x15, x21
|
|
clz x13, x15
|
|
|
|
// If there is no shift, goto |B-A|, X+Y update
|
|
cbz x13, .Lbeeu_update_B_X_or_A_Y
|
|
|
|
// Shift A right by "x13" bits
|
|
neg x14, x13
|
|
lsr x21, x21, x13
|
|
lsl x15, x22, x14
|
|
|
|
lsr x22, x22, x13
|
|
lsl x19, x23, x14
|
|
|
|
orr x21, x21, x15
|
|
|
|
lsr x23, x23, x13
|
|
lsl x20, x24, x14
|
|
|
|
orr x22, x22, x19
|
|
|
|
lsr x24, x24, x13
|
|
|
|
orr x23, x23, x20
|
|
|
|
|
|
// Shift Y right by "x13" bits, adding n whenever Y becomes odd.
|
|
// x13--;
|
|
// x14 := 0; needed in the addition to the most significant word in SHIFT1
|
|
eor x14, x14, x14
|
|
.Lbeeu_shift_loop_Y:
|
|
tbz x8, #0, .Lshift1_1
|
|
adds x8, x8, x0
|
|
adcs x9, x9, x1
|
|
adcs x10, x10, x2
|
|
adcs x11, x11, x30
|
|
adc x12, x12, x14
|
|
.Lshift1_1:
|
|
// var0 := [var1|var0]<64..1>;
|
|
// i.e. concatenate var1 and var0,
|
|
// extract bits <64..1> from the resulting 128-bit value
|
|
// and put them in var0
|
|
extr x8, x9, x8, #1
|
|
extr x9, x10, x9, #1
|
|
extr x10, x11, x10, #1
|
|
extr x11, x12, x11, #1
|
|
lsr x12, x12, #1
|
|
|
|
subs x13, x13, #1
|
|
bne .Lbeeu_shift_loop_Y
|
|
|
|
.Lbeeu_update_B_X_or_A_Y:
|
|
// Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow)
|
|
// Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words
|
|
// without taking a sign bit if generated. The lack of a carry would
|
|
// indicate a negative result. See, for example,
|
|
// https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes
|
|
subs x14, x25, x21
|
|
sbcs x15, x26, x22
|
|
sbcs x19, x27, x23
|
|
sbcs x20, x28, x24
|
|
bcs .Lbeeu_B_greater_than_A
|
|
|
|
// Else A > B =>
|
|
// A := A - B; Y := Y + X; goto beginning of the loop
|
|
subs x21, x21, x25
|
|
sbcs x22, x22, x26
|
|
sbcs x23, x23, x27
|
|
sbcs x24, x24, x28
|
|
|
|
adds x8, x8, x3
|
|
adcs x9, x9, x4
|
|
adcs x10, x10, x5
|
|
adcs x11, x11, x6
|
|
adc x12, x12, x7
|
|
b .Lbeeu_loop
|
|
|
|
.Lbeeu_B_greater_than_A:
|
|
// Continue with B > A =>
|
|
// B := B - A; X := X + Y; goto beginning of the loop
|
|
mov x25, x14
|
|
mov x26, x15
|
|
mov x27, x19
|
|
mov x28, x20
|
|
|
|
adds x3, x3, x8
|
|
adcs x4, x4, x9
|
|
adcs x5, x5, x10
|
|
adcs x6, x6, x11
|
|
adc x7, x7, x12
|
|
b .Lbeeu_loop
|
|
|
|
.Lbeeu_loop_end:
|
|
// The Euclid's algorithm loop ends when A == gcd(a,n);
|
|
// this would be 1, when a and n are co-prime (i.e. do not have a common factor).
|
|
// Since (-1)*Y*a == A (mod |n|), Y>0
|
|
// then out = -Y mod n
|
|
|
|
// Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
|
|
// Is A-1 == 0?
|
|
// If not, fail.
|
|
sub x14, x21, #1
|
|
orr x14, x14, x22
|
|
orr x14, x14, x23
|
|
orr x14, x14, x24
|
|
cbnz x14, .Lbeeu_err
|
|
|
|
// If Y>n ==> Y:=Y-n
|
|
.Lbeeu_reduction_loop:
|
|
// x_i := y_i - n_i (X is no longer needed, use it as temp)
|
|
// (x14 = 0 from above)
|
|
subs x3, x8, x0
|
|
sbcs x4, x9, x1
|
|
sbcs x5, x10, x2
|
|
sbcs x6, x11, x30
|
|
sbcs x7, x12, x14
|
|
|
|
// If result is non-negative (i.e., cs = carry set = no borrow),
|
|
// y_i := x_i; goto reduce again
|
|
// else
|
|
// y_i := y_i; continue
|
|
csel x8, x3, x8, cs
|
|
csel x9, x4, x9, cs
|
|
csel x10, x5, x10, cs
|
|
csel x11, x6, x11, cs
|
|
csel x12, x7, x12, cs
|
|
bcs .Lbeeu_reduction_loop
|
|
|
|
// Now Y < n (Y cannot be equal to n, since the inverse cannot be 0)
|
|
// out = -Y = n-Y
|
|
subs x8, x0, x8
|
|
sbcs x9, x1, x9
|
|
sbcs x10, x2, x10
|
|
sbcs x11, x30, x11
|
|
|
|
// Save Y in output (out (x0) was saved on the stack)
|
|
ldr x3, [sp,#96]
|
|
stp x8, x9, [x3]
|
|
stp x10, x11, [x3,#16]
|
|
// return 1 (success)
|
|
mov x0, #1
|
|
b .Lbeeu_finish
|
|
|
|
.Lbeeu_err:
|
|
// return 0 (error)
|
|
eor x0, x0, x0
|
|
|
|
.Lbeeu_finish:
|
|
// Restore callee-saved registers, except x0, x2
|
|
add sp,x29,#0
|
|
ldp x19,x20,[sp,#16]
|
|
ldp x21,x22,[sp,#32]
|
|
ldp x23,x24,[sp,#48]
|
|
ldp x25,x26,[sp,#64]
|
|
ldp x27,x28,[sp,#80]
|
|
ldp x29,x30,[sp],#112
|
|
|
|
AARCH64_VALIDATE_LINK_REGISTER
|
|
ret
|
|
.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime
|
|
#endif
|
|
#endif // !OPENSSL_NO_ASM
|
|
.section .note.GNU-stack,"",%progbits
|