214 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			214 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
//------------------------------------------------------------------------------
// push_v_regs / pop_v_regs
//
// Matching prologue/epilogue helpers. push_v_regs builds a 256-byte frame
// (4 x 32 bytes of q registers + 8 x 16 bytes of x registers) with pre-indexed
// stores, so sp stays 16-byte aligned after every step; pop_v_regs unwinds the
// same frame in exactly the reverse order with post-indexed loads.
//
// Saved/restored: q8-q15, x8-x17, x19-x21, x29, x30.
//
// x18 is deliberately NOT part of the frame. It is the AAPCS64 platform
// register: on targets that reserve it, hand-written code must neither spill
// it to the ordinary stack nor write it back from memory. x21 is stored on its
// own in a 16-byte slot so that the frame size and alignment are unchanged.
//
// The two macros must always be edited together - the pop order is the mirror
// image of the push order.
//------------------------------------------------------------------------------
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             x8, x9, [sp, #-16]!
    stp             x10, x11, [sp, #-16]!
    stp             x12, x13, [sp, #-16]!
    stp             x14, x15, [sp, #-16]!
    stp             x16, x17, [sp, #-16]!
    stp             x19, x20, [sp, #-16]!
    str             x21, [sp, #-16]!        // single register, full 16-byte slot keeps sp aligned
    stp             x29, x30, [sp, #-16]!
.endm

.macro pop_v_regs
    ldp             x29, x30, [sp], #16
    ldr             x21, [sp], #16          // mirrors the lone str in push_v_regs
    ldp             x19, x20, [sp], #16
    ldp             x16, x17, [sp], #16
    ldp             x14, x15, [sp], #16
    ldp             x12, x13, [sp], #16
    ldp             x10, x11, [sp], #16
    ldp             x8, x9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm
 | |
| 
 | |
//------------------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop2
//
// Syntax: GNU as, AArch64 (A64 + Advanced SIMD).
//
// In-place butterfly pass over two runs of 32-bit words that live in the same
// buffer, 256 bytes (64 words) apart. Each run is walked from both ends at
// once, one word per pointer per step.
//
// In:   x0 = subband  : base of the first run (32-bit words, read and written)
//       x1 = table of 16-bit coefficient pairs; one pair (4 bytes) is consumed
//            before the loop and one more per loop iteration
//       w2 = M        : 2*M words of each run are processed
//            NOTE(review): M is assumed to be a signed 32-bit C argument (the
//            reference-C notes below use 32-bit helpers) - confirm against the
//            C prototype.
// Out:  nothing is returned deliberately.
// Clobbers: x0-x7, v0-v4, v6, v16, NZCV. Every other register the body writes
//           (x8-x11, x19, x20, v8, v10, v12, v14) is saved by push_v_regs and
//           restored by pop_v_regs.
//
// Register roles inside the body:
//       x0  : psubband   - ascending pointer, first run
//       x3  : psubband1  - descending pointer, first run, starts at word 2*M-1
//       x10 : ascending pointer, second run (x0 + 256 bytes)
//       x11 : psubband12 - descending pointer, second run, starts at word 2*M-1
//       x8  : -4, post-index stride for the descending stores
//       x4  : loop counter, (M >> 1) - 1; the loop is bottom-tested, so the
//             body always executes at least once
//       v0  : first coefficient of the current pair, sign-extended, in .s[0..1]
//       v1  : second coefficient of the current pair, same layout
//
// Vector lane convention: 32-bit lane 0 carries first-run data and lane 1
// carries second-run data. After the widening multiplies the results sit in
// 64-bit lanes, so the low word of the second-run result is element .s[2].
// Every product is (coefficient * sample) >> 16 (arithmetic shift) and only
// the low 32 bits of each 64-bit result are stored.
//------------------------------------------------------------------------------
.text
.p2align 2

     .global ixheaacd_cos_sin_mod_loop2
ixheaacd_cos_sin_mod_loop2:

    push_v_regs

    // AAPCS64 leaves bits 63:32 of a 32-bit argument unspecified. M is used
    // below as a 64-bit index (x2, LSL #3) and as a 64-bit shift source, so
    // widen it once here instead of trusting the caller's upper half.
    sxtw            x2, w2

    // ---- pointer setup ------------------------------------------------------
    ADD             x3, x0, x2, LSL #3  //psubband1 = &subband[2 * M - 1];
    SUB             x3, x3, #4
    ADD             x10, x0, #256       // second run starts 64 words after the first
    ADD             x11, x10, x2, LSL #3
    SUB             x11, x11, #4        // last word (2*M-1) of the second run
    MOV             x8, #-4             // stride for post-decrementing ST1
    MOV             w19, #0
    DUP             V0.4s, w19          // clear both coefficient registers
    DUP             V1.4s, w19

    // ---- first run: end-point fix-up ----------------------------------------
    LDR             w6, [x0]
    sxtw            x6, w6
    ASR             x4, x2, #1          //M_2 = ixheaacd_shx32(M, 1);
    SUB             x4, x4, #1          // loop counter = M_2 - 1

    ASR             x6, x6, #1          //*psubband = *psubband >> 1;
    LD1             {v2.s}[0], [x3]     // re = *psubband1, read before it is overwritten

    STR             w6, [x0], #4        //psubband++;
    sxtw            x6, w6
    LDR             w7, [x0]
    sxtw            x7, w7
    ASR             x7, x7, #1
    sub             x20, x7, #0
    neg             x6, x20             // -(*psubband >> 1); cannot overflow after the >> 1
    STR             w6, [x3], #-4       // *psubband1-- = that value
    sxtw            x6, w6
    LD1             {v3.s}[0], [x3]     //  im = *psubband1;

    // ---- first coefficient pair ---------------------------------------------
    // Two consecutive halfwords: the first goes to v0, the second to v1. Each
    // is then sign-extended to 32 bits and replicated into both low lanes.
    LD2             {v0.h, v1.h}[0], [x1], #4
    sxtl            v0.4s, v0.4h
    sxtl            v1.4s, v1.4h
    dup             v0.2s, v0.s[0]
    dup             v1.2s, v1.s[0]

    // ---- second run: end-point fix-up ---------------------------------------
    LD1             {v2.s}[1], [x11]    //re = *psubband12;

    LD1             {v4.s}[0], [x10]    // first word of the second run
    SSHR            v4.2s, v4.2s, #1
    MOV             x9, #0
    DUP             v6.2s, w9
    SQSUB           v4.2s, v6.2s, v4.2s // saturating 0 - (word >> 1); only lane 0 is used

    ST1             {v4.s}[0], [x11]    // overwrite the last word of the second run
    SUB             x11, x11, #4

    LDR             w6, [x10, #4]       // second word of the run ...
    sxtw            x6, w6
    ASR             x6, x6, #1
    STR             w6, [x10], #4       // ... halved and stored into the FIRST word, then advance
    sxtw            x6, w6

    LD1             {v3.s}[1], [x11]

    // ---- butterfly on the end points ----------------------------------------
    // v2 = {re(first run), re(second run)}, v3 = the words just below them.
    sMULL           v4.2d, v0.2s, v2.2s //qsub 2nd
    sshr            v4.2d, v4.2d, #16
    sMULL           v6.2d, v0.2s, v3.2s //add 2nd
    sshr            v6.2d, v6.2d, #16
    sMULL           v8.2d, v1.2s, v2.2s //add 1st
    sshr            v8.2d, v8.2d, #16
    sMULL           v10.2d, v1.2s, v3.2s //qsub 1st
    sshr            v10.2d, v10.2d, #16

    add             v12.2d, v8.2d , v6.2d
    SQSUB           v14.2d, v10.2d , v4.2d
    SQSUB           v16.2d, v4.2d , v10.2d

    ST1             {v12.s}[0], [x3], x8    // first run, descending:  v8 + v6
    ST1             {v14.s}[0], [x0], #4    // first run, ascending:   v10 - v4

    SQNEG           v12.4s, v12.4s          // per-32-bit-lane saturating negate

    ST1             {v12.s}[2], [x10], #4   // second run, ascending:  -(v8 + v6)
    ST1             {v16.s}[2], [x11], x8   // second run, descending: v4 - v10

    // ---- main loop: two butterflies per iteration ---------------------------
LOOP1:
    // First butterfly: still uses the coefficient pair loaded previously.
    LD1             {v2.2s}, [x0]       // two ascending words, first run
    LD1             {v3.2s}, [x10]      // two ascending words, second run
    LDR             w5, [x3]            //RE2  (kept for the second butterfly)
    sxtw            x5, w5
    LDR             w6, [x11]           //RE3  (kept for the second butterfly)
    sxtw            x6, w6
    // Transpose so that v2 = {first-run word 0, second-run word 0} and
    // v3 = {first-run word 1, second-run word 1}.
    TRN1            v4.2s, v2.2s, v3.2s
    TRN2            v3.2s, v2.2s, v3.2s
    MOV             v2.8b, v4.8b

    sMULL           v4.2d, v0.2s, v2.2s //qsub 2nd
    sshr            v4.2d, v4.2d, #16
    sMULL           v6.2d, v0.2s, v3.2s //add 2nd
    sshr            v6.2d, v6.2d, #16
    sMULL           v8.2d, v1.2s, v2.2s //add 1st
    sshr            v8.2d, v8.2d, #16
    sMULL           v10.2d, v1.2s, v3.2s //qsub 1st
    sshr            v10.2d, v10.2d, #16

    add             v12.2d, v8.2d , v6.2d
    SQSUB           v14.2d, v4.2d , v10.2d
    SQSUB           v16.2d, v10.2d , v4.2d

    ST1             {v12.s}[0], [x0], #4    // first run, ascending:   v8 + v6
    ST1             {v14.s}[0], [x3], x8    // first run, descending:  v4 - v10
    SQNEG           v12.4s, v12.4s

    ST1             {v12.s}[2], [x11], x8   // second run, descending: -(v8 + v6)
    ST1             {v16.s}[2], [x10], #4   // second run, ascending:  v10 - v4

    // Second butterfly: fetch the next coefficient pair.
    MOV             w19, #0
    DUP             V0.4s, w19
    DUP             V1.4s, w19
    // second part
    LD2             {v0.h, v1.h}[0], [x1], #4
    sxtl            v0.4s, v0.4h
    sxtl            v1.4s, v1.4h
    dup             v0.2s, v0.s[0]
    dup             v1.2s, v1.s[0]

    // v3 = the descending words captured before the first butterfly
    // overwrote them; v2 = the next descending word of each run.
    mov             v3.s[0], w5
    mov             v3.s[1], w6
    LD1             {v2.s}[0], [x3]
    LD1             {v2.s}[1], [x11]

    sMULL           v4.2d, v0.2s, v2.2s //qsub 2nd
    sshr            v4.2d, v4.2d, #16
    sMULL           v6.2d, v0.2s, v3.2s //add 2nd
    sshr            v6.2d, v6.2d, #16
    sMULL           v8.2d, v1.2s, v2.2s //add 1st
    sshr            v8.2d, v8.2d, #16
    sMULL           v10.2d, v1.2s, v3.2s //qsub 1st
    sshr            v10.2d, v10.2d, #16

    add             v12.2d, v4.2d , v10.2d
    SQSUB           v14.2d, v8.2d , v6.2d
    SQSUB           v16.2d, v6.2d , v8.2d

    ST1             {v12.s}[0], [x3], x8    // first run, descending:  v4 + v10
    ST1             {v14.s}[0], [x0], #4    // first run, ascending:   v8 - v6

    SQNEG           v12.4s, v12.4s

    subs            x4, x4, #1              // flags consumed by BGT below; the ST1s do not touch them
    ST1             {v12.s}[2], [x10], #4   // second run, ascending:  -(v4 + v10)
    ST1             {v16.s}[2], [x11], x8   // second run, descending: v6 - v8

    BGT             LOOP1

    pop_v_regs
    ret
 |