//-----------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop2 — hand-written AArch64 (ARMv8) NEON routine
// (ported from an ARMv7 original; the ARMv7 instructions survive as
// commented-out lines, e.g. STMFD/VPUSH).
// Assembler: GNU as, AArch64 syntax; comments use //.
//-----------------------------------------------------------------------
.macro push_v_regs
|
|
stp q8, q9, [sp, #-32]!
|
|
stp q10, q11, [sp, #-32]!
|
|
stp q12, q13, [sp, #-32]!
|
|
stp q14, q15, [sp, #-32]!
|
|
stp X8, X9, [sp, #-16]!
|
|
stp X10, X11, [sp, #-16]!
|
|
stp X12, X13, [sp, #-16]!
|
|
stp X14, X15, [sp, #-16]!
|
|
stp X16, X17, [sp, #-16]!
|
|
stp X18, X19, [sp, #-16]!
|
|
stp X20, X21, [sp, #-16]!
|
|
stp X29, X30, [sp, #-16]!
|
|
.endm
|
|
|
|
.macro pop_v_regs
|
|
ldp X29, X30, [sp], #16
|
|
ldp X20, X21, [sp], #16
|
|
ldp X18, X19, [sp], #16
|
|
ldp X16, X17, [sp], #16
|
|
ldp X14, X15, [sp], #16
|
|
ldp X12, X13, [sp], #16
|
|
ldp X10, X11, [sp], #16
|
|
ldp X8, X9, [sp], #16
|
|
ldp q14, q15, [sp], #32
|
|
ldp q12, q13, [sp], #32
|
|
ldp q10, q11, [sp], #32
|
|
ldp q8, q9, [sp], #32
|
|
.endm
|
|
|
|
.text
|
|
.p2align 2
|
|
|
|
.global ixheaacd_cos_sin_mod_loop2
|
|
ixheaacd_cos_sin_mod_loop2:
|
|
|
|
// STMFD sp!, {x4-x12, x14}
|
|
push_v_regs
|
|
//stp x19, x20,[sp,#-16]!
|
|
//VPUSH {D8-D15}
|
|
//generating load addresses
|
|
ADD x3, x0, x2, LSL #3 //psubband1 = &subband[2 * M - 1];
|
|
SUB x3, x3, #4
|
|
ADD x10, x0, #256
|
|
ADD x11, x10, x2, LSL #3
|
|
SUB x11, x11, #4
|
|
MOV x8, #-4
|
|
MOV w19, #0
|
|
DUP V0.4s, w19
|
|
DUP V1.4s, w19
|
|
|
|
LDR w6, [x0]
|
|
sxtw x6, w6
|
|
ASR x4, x2, #1 //M_2 = ixheaacd_shx32(M, 1);
|
|
SUB x4, x4, #1
|
|
|
|
ASR x6, x6, #1 //*psubband = *psubband >> 1;
|
|
LD1 {v2.s}[0], [x3]
|
|
|
|
STR w6, [x0], #4 //psubband++;
|
|
sxtw x6, w6
|
|
LDR w7, [x0]
|
|
sxtw x7, w7
|
|
ASR x7, x7, #1
|
|
sub x20, x7, #0
|
|
neg x6, x20
|
|
STR w6, [x3], #-4
|
|
sxtw x6, w6
|
|
LD1 {v3.s}[0], [x3] // im = *psubband1;
|
|
|
|
LD2 {v0.h, v1.h}[0], [x1], #4
|
|
sxtl v0.4s, v0.4h
|
|
sxtl v1.4s, v1.4h
|
|
dup v0.2s, v0.s[0]
|
|
dup v1.2s, v1.s[0]
|
|
|
|
LD1 {v2.s}[1], [x11] //re = *psubband12;
|
|
|
|
// LDR w6, [x10]
|
|
// sxtw x6,w6
|
|
// ASR x7, x6, #1
|
|
// MOV x9, #0
|
|
// QSUB x7, x9, x7
|
|
LD1 {v4.s}[0], [x10]
|
|
SSHR v4.2s, v4.2s, #1
|
|
MOV x9, #0
|
|
DUP v6.2s, w9
|
|
SQSUB v4.2s, v6.2s, v4.2s
|
|
|
|
ST1 {v4.s}[0], [x11]
|
|
// str X7, [X11]
|
|
SUB x11, x11, #4
|
|
// sxtw x7,w7
|
|
|
|
LDR w6, [x10, #4]
|
|
sxtw x6, w6
|
|
ASR x6, x6, #1
|
|
STR w6, [x10], #4
|
|
sxtw x6, w6
|
|
|
|
LD1 {v3.s}[1], [x11]
|
|
|
|
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
|
|
sshr v4.2d, v4.2d, #16
|
|
sMULL v6.2d, v0.2s, v3.2s //add 2nd
|
|
sshr v6.2d, v6.2d, #16
|
|
sMULL v8.2d, v1.2s, v2.2s //add 1st
|
|
sshr v8.2d, v8.2d, #16
|
|
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
|
|
sshr v10.2d, v10.2d, #16
|
|
|
|
add v12.2d, v8.2d , v6.2d
|
|
SQSUB v14.2d, v10.2d , v4.2d
|
|
SQSUB v16.2d, v4.2d , v10.2d
|
|
|
|
//shrn v12.2s, v12.2d,#32
|
|
//shrn v14.2s, v14.2d,#32
|
|
//shrn v16.2s, v16.2d,#32
|
|
|
|
ST1 {v12.s}[0], [x3], x8
|
|
|
|
ST1 {v14.s}[0], [x0], #4
|
|
|
|
SQNEG v12.4s, v12.4s
|
|
|
|
|
|
ST1 {v12.s}[2], [x10], #4
|
|
|
|
ST1 {v16.s}[2], [x11], x8
|
|
|
|
LOOP1:
|
|
LD1 {v2.2s}, [x0]
|
|
LD1 {v3.2s}, [x10]
|
|
LDR w5, [x3] //RE2
|
|
sxtw x5, w5
|
|
LDR w6, [x11] //RE3
|
|
sxtw x6, w6
|
|
//VTRN.32 D2, D3
|
|
TRN1 v4.2s, v2.2s, v3.2s
|
|
TRN2 v3.2s, v2.2s, v3.2s
|
|
MOV v2.8b, v4.8b
|
|
|
|
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
|
|
sshr v4.2d, v4.2d, #16
|
|
sMULL v6.2d, v0.2s, v3.2s //add 2nd
|
|
sshr v6.2d, v6.2d, #16
|
|
sMULL v8.2d, v1.2s, v2.2s //add 1st
|
|
sshr v8.2d, v8.2d, #16
|
|
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
|
|
sshr v10.2d, v10.2d, #16
|
|
|
|
add v12.2d, v8.2d , v6.2d
|
|
SQSUB v14.2d, v4.2d , v10.2d
|
|
SQSUB v16.2d, v10.2d , v4.2d
|
|
|
|
//shrn v12.2s, v12.2d,#32
|
|
//shrn v14.2s, v14.2d,#32
|
|
//shrn v16.2s, v16.2d,#32
|
|
|
|
ST1 {v12.s}[0], [x0], #4
|
|
ST1 {v14.s}[0], [x3], x8
|
|
SQNEG v12.4s, v12.4s
|
|
|
|
ST1 {v12.s}[2], [x11], x8
|
|
ST1 {v16.s}[2], [x10], #4
|
|
|
|
MOV w19, #0
|
|
DUP V0.4s, w19
|
|
DUP V1.4s, w19
|
|
// second part
|
|
LD2 {v0.h, v1.h}[0], [x1], #4
|
|
sxtl v0.4s, v0.4h
|
|
sxtl v1.4s, v1.4h
|
|
dup v0.2s, v0.s[0]
|
|
dup v1.2s, v1.s[0]
|
|
|
|
mov v3.s[0], w5
|
|
mov v3.s[1], w6
|
|
LD1 {v2.s}[0], [x3]
|
|
LD1 {v2.s}[1], [x11]
|
|
|
|
sMULL v4.2d, v0.2s, v2.2s //qsub 2nd
|
|
sshr v4.2d, v4.2d, #16
|
|
sMULL v6.2d, v0.2s, v3.2s //add 2nd
|
|
sshr v6.2d, v6.2d, #16
|
|
sMULL v8.2d, v1.2s, v2.2s //add 1st
|
|
sshr v8.2d, v8.2d, #16
|
|
sMULL v10.2d, v1.2s, v3.2s //qsub 1st
|
|
sshr v10.2d, v10.2d, #16
|
|
|
|
add v12.2d, v4.2d , v10.2d
|
|
SQSUB v14.2d, v8.2d , v6.2d
|
|
SQSUB v16.2d, v6.2d , v8.2d
|
|
|
|
//shrn v12.2s, v12.2d,#32
|
|
//shrn v14.2s, v14.2d,#32
|
|
//shrn v16.2s, v16.2d,#32
|
|
|
|
ST1 {v12.s}[0], [x3], x8
|
|
ST1 {v14.s}[0], [x0], #4
|
|
|
|
SQNEG v12.4s, v12.4s
|
|
|
|
subs x4, x4, #1
|
|
ST1 {v12.s}[2], [x10], #4
|
|
ST1 {v16.s}[2], [x11], x8
|
|
|
|
BGT LOOP1
|
|
//VPOP {D8-D15}
|
|
// LDMFD sp!, {x4-x12, x15}
|
|
//ldp x19, x20,[sp],#16
|
|
pop_v_regs
|
|
ret
|