// Listing metadata from the code viewer: ArmAsm, 232 lines, 6.4 KiB.
//-----------------------------------------------------------------------
// push_v_regs - spill registers to the stack (AArch64, GNU as syntax).
//
// Stores, in this order:
//   q8-q15            four pairs, 32 bytes each
//   x8-x21, x29, x30  eight pairs, 16 bytes each
// Every store pre-decrements sp, so sp drops by 4*32 + 8*16 = 256 bytes
// in total and stays a multiple of 16 if it was one on entry.
//
// Despite the "v_regs" name, general-purpose registers are saved too.
// x22-x28 are NOT saved here; code running between push_v_regs and
// pop_v_regs must leave them untouched.
// Must be matched by pop_v_regs, which reloads in the reverse order.
//-----------------------------------------------------------------------
.macro push_v_regs
    stp             q8, q9, [sp, #-32]!
    stp             q10, q11, [sp, #-32]!
    stp             q12, q13, [sp, #-32]!
    stp             q14, q15, [sp, #-32]!
    stp             X8, X9, [sp, #-16]!
    stp             X10, X11, [sp, #-16]!
    stp             X12, X13, [sp, #-16]!
    stp             X14, X15, [sp, #-16]!
    stp             X16, X17, [sp, #-16]!
    stp             X18, X19, [sp, #-16]!
    stp             X20, X21, [sp, #-16]!
    stp             X29, X30, [sp, #-16]!       // frame pointer / link register last
.endm

//-----------------------------------------------------------------------
// pop_v_regs - reload the registers spilled by push_v_regs.
//
// Loads, in this order (exact reverse of push_v_regs):
//   x29, x30, x20-x21 ... x8-x9   eight pairs, 16 bytes each
//   q14-q15 ... q8-q9             four pairs, 32 bytes each
// Every load post-increments sp, so sp rises by 256 bytes in total and
// ends at the value it had before the matching push_v_regs.
//
// sp must hold the value left by push_v_regs when this macro runs.
//-----------------------------------------------------------------------
.macro pop_v_regs
    ldp             X29, X30, [sp], #16
    ldp             X20, X21, [sp], #16
    ldp             X18, X19, [sp], #16
    ldp             X16, X17, [sp], #16
    ldp             X14, X15, [sp], #16
    ldp             X12, X13, [sp], #16
    ldp             X10, X11, [sp], #16
    ldp             X8, X9, [sp], #16
    ldp             q14, q15, [sp], #32
    ldp             q12, q13, [sp], #32
    ldp             q10, q11, [sp], #32
    ldp             q8, q9, [sp], #32
.endm

.text
.p2align 2

//-----------------------------------------------------------------------
// ixheaacd_cos_sin_mod_loop1            (AArch64, GNU as syntax, NEON)
//
// In:
//   x0 = pointer to 32-bit input words, read upward (4 bytes per part)
//   x1 = element count; used as a full 64-bit value
//        NOTE(review): if the C prototype passes this as a 32-bit int,
//        the upper 32 bits of x1 are unspecified - confirm with caller.
//   x2 = pointer to 16-bit coefficients, read upward, 2 halfwords per part
//   x3 = pointer to 32-bit output words, written upward (8 bytes per
//        odd-numbered part)
// Derived pointers:
//   x4 = x0 + 8*x1 - 4   second input stream, read downward (4 bytes/part)
//   x5 = x3 + 8*x1 - 8   second output stream, written downward
//                        (8 bytes per even-numbered part)
//   x6 = x1 >> 2         loop counter; the body is a do-while, so it
//                        always runs at least once, even when x6 == 0.
//                        NOTE(review): confirm callers never pass x1 < 4.
//   x7 = scratch address for the "+256 bytes" accesses
//
// Every load/store through x0, x4, x3 and x5 is paired with a second
// access exactly 256 bytes (64 32-bit words) higher. The base access
// uses 64-bit lane 0 of the vectors, the +256 access uses 64-bit lane 1.
//
// Per part, with c0, c1 = the two halfwords from x2, p = word from the
// x0 stream, q = word from the x4 stream (p', q' = their +256 twins):
//   v4  = { c0*p, c1*p' } >> 16
//   v6  = { c0*q, c1*q' } >> 16
//   v8  = { c1*p, c0*p' } >> 16
//   v10 = { c1*q, c0*q' } >> 16
// Parts 1 and 3:  sum = v8 + v6,   diff = sqsub(v10, v4), stored via x3.
// Parts 2 and 4:  sum = v10 + v4,  diff = sqsub(v8, v6),  stored via x5.
// Stores:  [ptr] = sum lane 0,    [ptr+4]   = diff lane 0,
//          [ptr+256] = diff lane 2, [ptr+260] = sum lane 2
// where lanes are 32-bit (.4s) lanes, i.e. only the low word of each
// shifted 64-bit product (product bits 47:16) is kept. The sum is a
// wrapping add; only the difference saturates.
//
// NOTE(review): c0/c1 are inserted into halfword lanes 0 and 2 of a
// zeroed v0, so smull sees them zero-extended, not sign-extended.
// Presumably the coefficient table is non-negative - confirm.
//
// Out:   nothing is returned in a register.
// Clobb: x0, x2-x7, v0-v4, v6, NZCV. x19, v8 and v10 are also written
//        but restored by pop_v_regs. Uses 256 bytes of stack.
//-----------------------------------------------------------------------
     .global ixheaacd_cos_sin_mod_loop1
ixheaacd_cos_sin_mod_loop1:

    push_v_regs                                 // saves x19, q8, q10 used below

//generating load addresses
    ADD             x4, x0, x1, lsl #3  //psubband1
    SUB             x4, x4, #4
    ADD             x5, x3, x1, lsl #3  //psubband1_t
    SUB             x5, x5, #8
    ASR             x6, x1, #2                  // iterations; 4 parts each

    MOV             w19, #0
    DUP             V0.8h, w19                  // v0 = 0 so h[1], h[3] stay zero
LOOP1:
//first part
    ld1             {v0.h}[0] , [x2]            // c0 -> low half of s[0]
    ADD             x2, x2, #2
    ld1             {v0.h}[2] , [x2]            // c1 -> low half of s[1]
    ADD             x2, x2, #2
    rev64           v1.2s, v0.2s                // v1.2s = { c1, c0 }
    ld1             {v2.s}[0], [x0]             // p
    ADD             x0, x0, #4
    ADD             x7, x0, #252                // = old x0 + 256
    ld1             {v2.s}[1], [x7]             // p'
    ld1             {v3.s}[0], [x4]             // q
    ADD             x7, x4, #256
    ld1             {v3.s}[1], [x7]             // q'
    SUB             x4, x4, #4

    sMULL           v4.2d, v0.2s, v2.2s //qsub 2nd
    sshr            v4.2d, v4.2d, #16
    sMULL           v6.2d, v0.2s, v3.2s //add 2nd
    sshr            v6.2d, v6.2d, #16
    sMULL           v8.2d, v1.2s, v2.2s //add 1st
    sshr            v8.2d, v8.2d, #16
    sMULL           v10.2d, v1.2s, v3.2s //qsub 1st
    sshr            v10.2d, v10.2d, #16

    add             v0.4s, v8.4s , v6.4s        // sum (wrapping)
    SQSUB           v2.4s, v10.4s , v4.4s       // diff (saturating)

    mov             v3.16b, v0.16b              // v3 = sum,  for the {v2,v3} store
    mov             v1.16b, v2.16b              // v1 = diff, for the {v0,v1} store
    ST2             {v0.s, v1.s}[0], [x3]       // [x3] = sum.s[0], [x3+4] = diff.s[0]
    ADD             x3, x3, #8
    ADD             x7, x3, #248                // = old x3 + 256
    ST2             {v2.s, v3.s}[2], [x7]       // diff.s[2], then sum.s[2]
    MOV             w19, #0
    DUP             V0.8h, w19                  // re-zero v0 for the next coefficient pair
//second part
    ld1             {v0.h}[0] , [x2]
    ADD             x2, x2, #2
    ld1             {v0.h}[2] , [x2]
    ADD             x2, x2, #2
    rev64           v1.2s, v0.2s
    ld1             {v2.s}[0], [x0]
    ADD             x0, x0, #4
    ADD             x7, x0, #252
    ld1             {v2.s}[1], [x7]
    ld1             {v3.s}[0], [x4]
    ADD             x7, x4, #256
    ld1             {v3.s}[1], [x7]
    SUB             x4, x4, #4

    sMULL           v4.2d, v0.2s, v2.2s //add 2nd
    sshr            v4.2d, v4.2d, #16
    sMULL           v6.2d, v0.2s, v3.2s //qsub 2nd
    sshr            v6.2d, v6.2d, #16
    sMULL           v8.2d, v1.2s, v2.2s //qsub 1st
    sshr            v8.2d, v8.2d, #16
    sMULL           v10.2d, v1.2s, v3.2s //add 1st
    sshr            v10.2d, v10.2d, #16

    ADD             v0.4s, v10.4s , v4.4s       // sum (wrapping)
    SQSUB           v2.4s, v8.4s , v6.4s        // diff (saturating)

    mov             v3.16b, v0.16b
    mov             v1.16b, v2.16b
    ST2             {v0.s, v1.s}[0], [x5]       // [x5] = sum.s[0], [x5+4] = diff.s[0]
    ADD             x7, x5, #256
    ST2             {v2.s, v3.s}[2], [x7]       // diff.s[2], then sum.s[2]
    SUB             x5, x5, #8                  // second output stream walks downward
    MOV             w19, #0
    DUP             V0.8h, w19
//Third part
    ld1             {v0.h}[0] , [x2]
    ADD             x2, x2, #2
    ld1             {v0.h}[2] , [x2]
    ADD             x2, x2, #2
    rev64           v1.2s, v0.2s
    ld1             {v2.s}[0], [x0], #4         // post-indexed: same as the ld1 + ADD pair elsewhere
    ADD             x7, x0, #252
    ld1             {v2.s}[1], [x7]
    ld1             {v3.s}[0], [x4]
    ADD             x7, x4, #256
    ld1             {v3.s}[1], [x7]
    SUB             x4, x4, #4

    sMULL           v4.2d, v0.2s, v2.2s //qsub 2nd
    sshr            v4.2d, v4.2d, #16
    sMULL           v6.2d, v0.2s, v3.2s //add 2nd
    sshr            v6.2d, v6.2d, #16
    sMULL           v8.2d, v1.2s, v2.2s //add 1st
    sshr            v8.2d, v8.2d, #16
    sMULL           v10.2d, v1.2s, v3.2s //qsub 1st
    sshr            v10.2d, v10.2d, #16

    add             v0.4s, v8.4s , v6.4s
    SQSUB           v2.4s, v10.4s , v4.4s

    mov             v3.16b, v0.16b
    mov             v1.16b, v2.16b
    ST2             {v0.s, v1.s}[0], [x3]
    ADD             x3, x3, #8
    ADD             x7, x3, #248
    ST2             {v2.s, v3.s}[2], [x7]
    MOV             w19, #0
    DUP             V0.8h, w19
//Fourth part
    ld1             {v0.h}[0] , [x2]
    ADD             x2, x2, #2
    ld1             {v0.h}[2] , [x2]
    ADD             x2, x2, #2
    rev64           v1.2s, v0.2s
    ld1             {v2.s}[0], [x0]
    ADD             x0, x0, #4
    ADD             x7, x0, #252
    ld1             {v2.s}[1], [x7]
    ld1             {v3.s}[0], [x4]
    ADD             x7, x4, #256
    ld1             {v3.s}[1], [x7]
    SUB             x4, x4, #4

    sMULL           v4.2d, v0.2s, v2.2s //add 2nd
    sshr            v4.2d, v4.2d, #16
    sMULL           v6.2d, v0.2s, v3.2s //qsub 2nd
    sshr            v6.2d, v6.2d, #16
    sMULL           v8.2d, v1.2s, v2.2s //qsub 1st
    sshr            v8.2d, v8.2d, #16
    sMULL           v10.2d, v1.2s, v3.2s //add 1st
    sshr            v10.2d, v10.2d, #16

    ADD             v0.4s, v10.4s , v4.4s
    SQSUB           v2.4s, v8.4s , v6.4s

    mov             v3.16b, v0.16b
    mov             v1.16b, v2.16b
    ST2             {v0.s, v1.s}[0], [x5]
    ADD             x7, x5, #256
    SUBS            x6, x6, #1                  // sets flags for BGT; nothing below alters NZCV
    ST2             {v2.s, v3.s}[2], [x7]
    SUB             x5, x5, #8
    MOV             w19, #0
    DUP             V0.8h, w19
    BGT             LOOP1                       // repeat while the decremented count is > 0

    pop_v_regs
    ret