149 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			149 lines
		
	
	
		
			4.4 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| //.include "ihevc_neon_macros.s"
 | |
// push_v_regs
// Spills x8-x15 and the x29/x30 pair into a freshly allocated 80-byte stack
// block. Despite the name, only general-purpose registers are saved here.
//
// Stack layout once the macro has run (offsets from the new sp):
//     #0  : x29, x30
//     #16 : x14, x15
//     #32 : x12, x13
//     #48 : x10, x11
//     #64 : x8,  x9
// sp drops by 80 bytes in total, a multiple of 16.
.macro push_v_regs
    stp             x29, x30, [sp, #-80]!   // allocate the whole block in one step
    stp             x14, x15, [sp, #16]
    stp             x12, x13, [sp, #32]
    stp             x10, x11, [sp, #48]
    stp             x8,  x9,  [sp, #64]
.endm
 | |
// pop_v_regs
// Reloads x8-x15 and the x29/x30 pair from the 80-byte block at the top of
// the stack and releases that block. Only general-purpose registers are
// involved, despite the name.
//
// Slots read (offsets from sp on entry to the macro):
//     #0  : x29, x30
//     #16 : x14, x15
//     #32 : x12, x13
//     #48 : x10, x11
//     #64 : x8,  x9
// sp rises by 80 bytes in total.
.macro pop_v_regs
    ldp             x8,  x9,  [sp, #64]
    ldp             x10, x11, [sp, #48]
    ldp             x12, x13, [sp, #32]
    ldp             x14, x15, [sp, #16]
    ldp             x29, x30, [sp], #80     // last load also frees the block
.endm
 | |
| 
 | |
.text
.p2align 2
        .global ixheaacd_postradixcompute4

// Byte distance between the output rows y0, y1, y2 and y3. The original
// stores advanced by 4 and then by (14 << 1) = 28 bytes, i.e. 32 in total.
        .equ    POSTRADIX4_Y_STRIDE, 32
// Number of loop passes; each pass consumes 16 input words.
        .equ    POSTRADIX4_PASSES, 2

// postradix4_half src
// One 8-word add/sub butterfly.
//   Reads  : x_0..x_7 from [\src]; \src is post-incremented by 32 bytes.
//   Writes : word pairs at [x0] + 0, +32, +64, +96 bytes, then x0 += 8.
//   Uses   : w5-w12 and w14 as scratch.
// NOTE(review): the sum/difference pattern looks like a radix-4 butterfly on
// interleaved re/im words - confirm against the C reference.
.macro postradix4_half src
    ldp             w5,  w6,  [\src], #8        // x_0 : x_1
    ldp             w7,  w8,  [\src], #8        // x_2 : x_3
    ldp             w9,  w10, [\src], #8        // x_4 : x_5
    ldp             w11, w12, [\src], #8        // x_6 : x_7

    add             w14, w5,  w9                // xh0_0 = x_0 + x_4
    sub             w5,  w5,  w9                // xl0_0 = x_0 - x_4
    add             w9,  w6,  w10               // xh1_0 = x_1 + x_5
    sub             w6,  w6,  w10               // xl1_0 = x_1 - x_5
    add             w10, w7,  w11               // xh0_1 = x_2 + x_6
    sub             w7,  w7,  w11               // xl0_1 = x_2 - x_6
    add             w11, w8,  w12               // xh1_1 = x_3 + x_7
    sub             w8,  w8,  w12               // xl1_1 = x_3 - x_7

    add             w12, w14, w10               // n00 = xh0_0 + xh0_1
    sub             w14, w14, w10               // n20 = xh0_0 - xh0_1
    add             w10, w9,  w11               // n01 = xh1_0 + xh1_1
    sub             w9,  w9,  w11               // n21 = xh1_0 - xh1_1
    add             w11, w5,  w8                // n10 = xl0_0 + xl1_1
    sub             w5,  w5,  w8                // n30 = xl0_0 - xl1_1
    add             w8,  w6,  w7                // n31 = xl1_0 + xl0_1
    sub             w6,  w6,  w7                // n11 = xl1_0 - xl0_1

    stp             w12, w10, [x0]                              // y0 : n00, n01
    stp             w11, w6,  [x0, #POSTRADIX4_Y_STRIDE]        // y1 : n10, n11
    stp             w14, w9,  [x0, #POSTRADIX4_Y_STRIDE * 2]    // y2 : n20, n21
    stp             w5,  w8,  [x0, #POSTRADIX4_Y_STRIDE * 3]    // y3 : n30, n31
    add             x0, x0, #8                  // next pair of output columns
.endm

// ---------------------------------------------------------------------------
// ixheaacd_postradixcompute4
//
// Per the original comments this routine is hard coded for an FFT length of
// 16 and npoints is always 16: the output stride and the pass count are
// fixed, and only the offset of the second input stream is derived from w3.
//
// Syntax : GNU assembler, AArch64
// In     : x0 = output pointer, 32-bit words
//          x1 = input pointer, 32-bit words
//          x2 = not referenced by this routine
//          w3 = npoints (Word32 in the original comments; expected to be 16)
// Out    : 32 words written starting at the incoming x0
// Clobb  : x0, x1, x3, x4, x5-x7, NZCV flags
//          (x8-x15, x29 and x30 are saved/restored by push_v_regs/pop_v_regs)
// Stack  : 80 bytes while running (push_v_regs)
// ---------------------------------------------------------------------------
ixheaacd_postradixcompute4:

    // x8-x15 are caller-saved under AAPCS64 and x29/x30 are never written
    // here, so this save/restore is not required by the ABI. It is kept so
    // that callers relying on the original behaviour are unaffected.
    push_v_regs

    // Second input stream starts 2 * npoints bytes after the first.
    // npoints is a 32-bit argument, so only w3 is defined by the ABI; the
    // original used the full x3, whose upper half is unspecified.
    add             x4, x1, w3, sxtw #1         // x1 -> first stream, x4 -> second
    mov             w3, #POSTRADIX4_PASSES      // w3 is reused as the pass counter

POSTRADIX4_START:

    postradix4_half x1                          // x_0..x_7 from the first stream
    postradix4_half x4                          // x_8..x_f from the second stream

    // Each stream has just advanced 32 bytes; skip the 32 bytes that belong
    // to the other stream so both pointers land on their next group.
    add             x1, x1, #32
    add             x4, x4, #32

    subs            w3, w3, #1
    b.gt            POSTRADIX4_START

    pop_v_regs
    ret
 | |
| 
 | |
| 
 |