.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             x8, x9, [sp, #-16]!
    stp             x10, x11, [sp, #-16]!
    stp             x12, x13, [sp, #-16]!
    stp             x14, x15, [sp, #-16]!
    stp             x16, x17, [sp, #-16]!
    stp             x18, x19, [sp, #-16]!
    stp             x20, x21, [sp, #-16]!
    stp             x29, x30, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp             x29, x30, [sp], #16
    ldp             x20, x21, [sp], #16
    ldp             x18, x19, [sp], #16
    ldp             x16, x17, [sp], #16
    ldp             x14, x15, [sp], #16
    ldp             x12, x13, [sp], #16
    ldp             x10, x11, [sp], #16
    ldp             x8, x9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm

.text
.p2align 2
.global ixheaacd_cos_sin_mod_loop2

ixheaacd_cos_sin_mod_loop2:
    push_v_regs

    // Generate the load addresses.
    ADD             x3, x0, x2, LSL #3      // psubband1 = &subband[2 * M - 1];
    SUB             x3, x3, #4
    ADD             x10, x0, #256
    ADD             x11, x10, x2, LSL #3
    SUB             x11, x11, #4
    MOV             x8, #-4                 // post-decrement step for the downward pointers
    MOV             w19, #0
    DUP             v0.4s, w19
    DUP             v1.4s, w19

    // Prologue: handle the first sample pair outside the loop.
    LDR             w6, [x0]
    SXTW            x6, w6
    ASR             x4, x2, #1              // M_2 = M >> 1;
    SUB             x4, x4, #1
    ASR             x6, x6, #1              // *psubband = *psubband >> 1;
    LD1             {v2.s}[0], [x3]
    STR             w6, [x0], #4            // psubband++;
    SXTW            x6, w6
    LDR             w7, [x0]
    SXTW            x7, w7
    ASR             x7, x7, #1
    SUB             x20, x7, #0
    NEG             x6, x20                 // x6 = -x7
    STR             w6, [x3], #-4
    SXTW            x6, w6
    LD1             {v3.s}[0], [x3]         // im = *psubband1;
    LD2             {v0.h, v1.h}[0], [x1], #4   // load one interleaved 16-bit coefficient pair
    SXTL            v0.4s, v0.4h
    SXTL            v1.4s, v1.4h
    DUP             v0.2s, v0.s[0]
    DUP             v1.2s, v1.s[0]
    LD1             {v2.s}[1], [x11]        // re = *psubband12;
    LD1             {v4.s}[0], [x10]
    SSHR            v4.2s, v4.2s, #1
    MOV             x9, #0
    DUP             v6.2s, w9
    SQSUB           v4.2s, v6.2s, v4.2s     // saturating negate of the halved sample
    ST1             {v4.s}[0], [x11]
    SUB             x11, x11, #4
    LDR             w6, [x10, #4]
    SXTW            x6, w6
    ASR             x6, x6, #1
    STR             w6, [x10], #4
    SXTW            x6, w6
    LD1             {v3.s}[1], [x11]

    SMULL           v4.2d, v0.2s, v2.2s     // qsub 2nd
    SSHR            v4.2d, v4.2d, #16
    SMULL           v6.2d, v0.2s, v3.2s     // add 2nd
    SSHR            v6.2d, v6.2d, #16
    SMULL           v8.2d, v1.2s, v2.2s     // add 1st
    SSHR            v8.2d, v8.2d, #16
    SMULL           v10.2d, v1.2s, v3.2s    // qsub 1st
    SSHR            v10.2d, v10.2d, #16

    ADD             v12.2d, v8.2d, v6.2d
    SQSUB           v14.2d, v10.2d, v4.2d
    SQSUB           v16.2d, v4.2d, v10.2d

    ST1             {v12.s}[0], [x3], x8
    ST1             {v14.s}[0], [x0], #4
    SQNEG           v12.4s, v12.4s
    ST1             {v12.s}[2], [x10], #4
    ST1             {v16.s}[2], [x11], x8

LOOP1:
    // First half: gather sample pairs from the upward and downward pointers.
    LD1             {v2.2s}, [x0]
    LD1             {v3.2s}, [x10]
    LDR             w5, [x3]                // RE2
    SXTW            x5, w5
    LDR             w6, [x11]               // RE3
    SXTW            x6, w6
    TRN1            v4.2s, v2.2s, v3.2s
    TRN2            v3.2s, v2.2s, v3.2s
    MOV             v2.8b, v4.8b

    SMULL           v4.2d, v0.2s, v2.2s     // qsub 2nd
    SSHR            v4.2d, v4.2d, #16
    SMULL           v6.2d, v0.2s, v3.2s     // add 2nd
    SSHR            v6.2d, v6.2d, #16
    SMULL           v8.2d, v1.2s, v2.2s     // add 1st
    SSHR            v8.2d, v8.2d, #16
    SMULL           v10.2d, v1.2s, v3.2s    // qsub 1st
    SSHR            v10.2d, v10.2d, #16

    ADD             v12.2d, v8.2d, v6.2d
    SQSUB           v14.2d, v4.2d, v10.2d
    SQSUB           v16.2d, v10.2d, v4.2d

    ST1             {v12.s}[0], [x0], #4
    ST1             {v14.s}[0], [x3], x8
    SQNEG           v12.4s, v12.4s
    ST1             {v12.s}[2], [x11], x8
    ST1             {v16.s}[2], [x10], #4

    MOV             w19, #0
    DUP             v0.4s, w19
    DUP             v1.4s, w19

    // Second half: next coefficient pair, with operands gathered from
    // the scalar loads above.
    LD2             {v0.h, v1.h}[0], [x1], #4
    SXTL            v0.4s, v0.4h
    SXTL            v1.4s, v1.4h
    DUP             v0.2s, v0.s[0]
    DUP             v1.2s, v1.s[0]
    MOV             v3.s[0], w5
    MOV             v3.s[1], w6
    LD1             {v2.s}[0], [x3]
    LD1             {v2.s}[1], [x11]

    SMULL           v4.2d, v0.2s, v2.2s     // qsub 2nd
    SSHR            v4.2d, v4.2d, #16
    SMULL           v6.2d, v0.2s, v3.2s     // add 2nd
    SSHR            v6.2d, v6.2d, #16
    SMULL           v8.2d, v1.2s, v2.2s     // add 1st
    SSHR            v8.2d, v8.2d, #16
    SMULL           v10.2d, v1.2s, v3.2s    // qsub 1st
    SSHR            v10.2d, v10.2d, #16
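    // Combine step for the second half: v12 takes the plain sum of two
    // partial products, v14/v16 the saturating differences in both
    // orders. Note the pairing differs from the first half (v4 + v10
    // here versus v8 + v6 above). A hedged C model of this butterfly
    // is sketched in the comment at the end of the file.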
    ADD             v12.2d, v4.2d, v10.2d
    SQSUB           v14.2d, v8.2d, v6.2d
    SQSUB           v16.2d, v6.2d, v8.2d

    ST1             {v12.s}[0], [x3], x8
    ST1             {v14.s}[0], [x0], #4
    SQNEG           v12.4s, v12.4s
    SUBS            x4, x4, #1              // --M_2
    ST1             {v12.s}[2], [x10], #4
    ST1             {v16.s}[2], [x11], x8
    BGT             LOOP1

    pop_v_regs
    ret
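
/*
 * Hedged reference model (illustration only; never assembled). This is
 * a C sketch of the butterfly computed by each half of LOOP1, assuming
 * a/b are the two gathered 32-bit subband samples (v2/v3) and c0/c1 the
 * sign-extended 16-bit coefficient pair loaded from x1 (v0/v1). All
 * names below are illustrative, not taken from the ixheaacd C reference.
 *
 *   #include <stdint.h>
 *
 *   // Models the 32-bit saturation performed by SQNEG v12.4s.
 *   static int32_t sat32(int64_t v) {
 *       if (v > INT32_MAX) return INT32_MAX;
 *       if (v < INT32_MIN) return INT32_MIN;
 *       return (int32_t)v;
 *   }
 *
 *   // First-half pairing inside LOOP1: sum = v8 + v6, diff = v4 - v10.
 *   void butterfly(int32_t a, int32_t b, int16_t c0, int16_t c1,
 *                  int32_t *sum, int32_t *diff, int32_t *neg_sum) {
 *       int64_t c0a = ((int64_t)c0 * a) >> 16;   // SMULL + SSHR #16 (v4)
 *       int64_t c0b = ((int64_t)c0 * b) >> 16;   // (v6)
 *       int64_t c1a = ((int64_t)c1 * a) >> 16;   // (v8)
 *       int64_t c1b = ((int64_t)c1 * b) >> 16;   // (v10)
 *       *sum     = (int32_t)(c1a + c0b);         // ADD v12, then a 32-bit lane store
 *       *diff    = (int32_t)(c0a - c1b);         // SQSUB v14: 64-bit saturating sub
 *                                                // (unreachable at these magnitudes),
 *                                                // then a 32-bit lane store
 *       *neg_sum = sat32(-(int64_t)*sum);        // SQNEG v12.4s, stored from lane [2]
 *   }
 *
 * The pre-loop butterfly uses the opposite sign for diff (v10 - v4),
 * and the second half of LOOP1 swaps the pairing (sum = v4 + v10,
 * diff = v8 - v6); the structure is otherwise identical.
 */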