//.include "ihevc_neon_macros.s"
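
// Stack helpers: push_v_regs spills the temporaries x8-x17 together with the
// callee-saved registers x19-x22, x29 (frame pointer) and x30 (link register)
// in 16-byte pairs; pop_v_regs restores them in reverse order before returning.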
.macro push_v_regs
stp X8, X9, [sp, #-16]!
stp X10, X11, [sp, #-16]!
stp X12, X13, [sp, #-16]!
stp X14, X15, [sp, #-16]!
stp X16, X17, [sp, #-16]!
stp x19, x20, [sp, #-16]!
stp x21, x22, [sp, #-16]!
stp X29, X30, [sp, #-16]!
.endm

.macro pop_v_regs
ldp X29, X30, [sp], #16
ldp x21, x22, [sp], #16
ldp x19, x20, [sp], #16
ldp X16, X17, [sp], #16
ldp X14, X15, [sp], #16
ldp X12, X13, [sp], #16
ldp X10, X11, [sp], #16
ldp X8, X9, [sp], #16
.endm

.text
.p2align 2
.global ixheaacd_scale_factor_process_armv8
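
// ixheaacd_scale_factor_process_armv8
//
// Applies the per-band scale factors to the inverse-quantized spectral
// coefficients in place. The register roles below are inferred from the inline
// comments and the AAPCS64 argument order; the names are illustrative only:
//   x0 - x_invquant:      pointer to 32-bit coefficients, updated in place
//   x1 - Scfactor:        pointer to 16-bit scale factors, one per band
//   x2 - Tbands:          number of scale-factor bands
//   x3 - offset:          per-band width table, one byte per band (sfbWidth)
//   x4 - scale_table_ptr: 32-bit table indexed by (scale_factor & 0x3)
//   x5 - selects the shift bias: temp = 37 if x5 <= 2, else temp = 34
//   x6, x7 -              copied to x21/x22 but not used further in this routine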
ixheaacd_scale_factor_process_armv8:

push_v_regs

MOV x9, x4                          // x9 = scale_table_ptr; x4 is reused as a scratch register below
MOV x21, x6                         // preserved in a callee-saved register, not used further here
MOV x22, x7                         // preserved in a callee-saved register, not used further here
CMP x2, #0                          // Tbands
BGT lbl17                           // at least one band to process
pop_v_regs                          // Tbands <= 0: nothing to do
ret

lbl17:
MOV x10, #0                         // zero constant used to clear coefficient pairs below
CMP x5, #2
BGT ADD_34
MOV x11, #0x25                      // temp = 37
B TBANDS_LOOP
ADD_34:
MOV x11, #0x22                      // temp = 34
// MOV x11, #0x25 // temp=37
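
// Rough C-level sketch of the band loop below, reconstructed from the inline
// comments only (not the reference source); the extra fixed-point <<1
// adjustments visible in the code are folded into shx32() here for brevity:
//
//   for (i = 0; i < Tbands; i++) {
//       scale_factor = *Scfactor++;
//       sfbWidth     = *offset++;
//       if (scale_factor < 24) {                        // SCALE_FACTOR_LT_12
//           for (j = 0; j < sfbWidth; j++)
//               *x_invquant++ = 0;
//       } else {                                        // SCALE_FACTOR_GE_12
//           shift       = temp - (scale_factor >> 2);   // temp = 37 or 34
//           scale_short = scale_table_ptr[scale_factor & 0x3];
//           for (j = 0; j < sfbWidth; j++) {
//               buffex1       = mult32x16in32(*x_invquant, scale_short);
//               *x_invquant++ = shx32(buffex1, shift);
//           }
//       }
//   }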
TBANDS_LOOP:
LDRSH x5, [x1], #2                  // scale_factor = *Scfactor++;
LDRB w4, [x3], #1                   // sfbWidth = *offset++;
sxtw x4, w4

CMP x5, #0x18                       // if (scale_factor < 24)
BGE SCALE_FACTOR_GE_12

CMP x4, #0                          // skip empty bands
BLE OFFSET_ZERO

SCALE_FACTOR_LT_12:                 // scale_factor < 24: clear the band, four 32-bit words per iteration
STR x10, [x0], #8                   // *x_invquant++ = 0; *x_invquant++ = 0;
STR x10, [x0], #8
SUBS x4, x4, #4
BGT SCALE_FACTOR_LT_12
B OFFSET_ZERO

SCALE_FACTOR_GE_12:
SUBS x6, x11, x5, ASR #2            // shift = temp - (scale_factor >> 2)
AND x5, x5, #3                      // scale_factor & 0x0003

//ADD x5,x9,x5,LSL #1 ; scale_table_ptr[(scale_factor & 0x0003)];
LDR w5, [x9, x5, LSL #2]            // scale_short = scale_table_ptr[(scale_factor & 0x0003)];
sxtw x5, w5
AND w17, w5, #0x0000FFFF
sxth w17, w17                       // 16-bit value held in a 32-bit register for the SMULLs below
BLE SHIFT_LE_ZERO                   // flags are still those of the SUBS above: shift <= 0

SUB x14, x6, #1                     // x14 = shift - 1: folds the fixed-point <<1 into the right shift
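
// Each SMULL + ASR #16 pair below computes (temp1 * scale_short) >> 16, the same
// value the commented-out ARMv7 SMULWB would produce (w17 holds the sign-extended
// 16-bit scale_short). The following ASR by x14 = shift - 1 then implements
// shx32(buffex1, shift) with the fixed-point <<1 folded into the shift amount.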
SHIFT_POSITIVE:                     // loop over sfbWidth, a multiple of 4
LDP w6, w7, [x0, #0]                // temp1 = *x_invquant
LDP w19, w20, [x0, #8]

//SMULWB x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x6, w6, w17                   // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x7, w7, w17
SMULL x19, w19, w17
SMULL x20, w20, w17

ASR x6, x6, #16
ASR x7, x7, #16
ASR x19, x19, #16
ASR x20, x20, #16

ASR x6, x6, x14                     // buffex1 = shx32(buffex1, shift);
ASR x7, x7, x14
ASR x19, x19, x14
ASR x20, x20, x14

stp w6, w7, [x0], #8                // *x_invquant++ = buffex1;
stp w19, w20, [x0], #8

SUBS x4, x4, #4
BGT SHIFT_POSITIVE
B OFFSET_ZERO

SHIFT_LE_ZERO:
//RSBS x14, x6, #0 //-shift
NEGS x14, x6                        // x14 = -shift
BGT SHIFT_NEGTIVE1                  // -shift > 0, i.e. shift < 0

SHIFT_ZERO:                         // shift == 0: loop over sfbWidth, two samples per iteration
LDP w6, w7, [x0, #0]                // temp1 = *x_invquant;

//SMULWB x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x6, w6, w17                   // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x7, w7, w17

ASR x6, x6, #16
ASR x7, x7, #16

LSL x6, x6, #1                      // <<1 for the fixed-point multiply (no right shift to fold it into)
LSL x7, x7, #1

STP w6, w7, [x0], #8                // *x_invquant++ = buffex1;

SUBS x4, x4, #2
BGT SHIFT_ZERO
B OFFSET_ZERO
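
// shift < 0: the band is scaled up. Part of the left shift (x14 = -shift - 1) is
// applied before the multiply; the LSL #2 after the >>16 then supplies the <<1 of
// the fixed-point multiply plus one further <<1, as noted in the inline comments.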
SHIFT_NEGTIVE1:
SUB x14, x14, #1                    // x14 = -shift - 1

SHIFT_NEGTIVE:                      // loop over sfbWidth, two samples per iteration
LDP w6, w7, [x0, #0]                // temp1 = *x_invquant;
LSL w6, w6, w14                     // buffex1 = shl32(buffex1, shift-1);
LSL w7, w7, w14                     // buffex1 = shl32(buffex1, shift-1);

//SMULWB x6, x6, x5 // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x6, w6, w17                   // buffex1 = mult32x16in32(temp1, scale_short);
SMULL x7, w7, w17
ASR x6, x6, #16
ASR x7, x7, #16

LSL x6, x6, #2                      // shl for fixmul_32x16b and shl32(buffer,1)
LSL x7, x7, #2                      // shl for fixmul_32x16b and shl32(buffer,1)

STP w6, w7, [x0], #8                // *x_invquant++ = buffex1;

SUBS x4, x4, #2
BGT SHIFT_NEGTIVE

OFFSET_ZERO:
SUBS x2, x2, #1                     // Tbands--
BGT TBANDS_LOOP

pop_v_regs
ret