778 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			778 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| ///******************************************************************************
 | |
| // *
 | |
| // * Copyright (C) 2018 The Android Open Source Project
 | |
| // *
 | |
| // * Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // * you may not use this file except in compliance with the License.
 | |
| // * You may obtain a copy of the License at:
 | |
| // *
 | |
| // * http://www.apache.org/licenses/LICENSE-2.0
 | |
| // *
 | |
| // * Unless required by applicable law or agreed to in writing, software
 | |
| // * distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // * See the License for the specific language governing permissions and
 | |
| // * limitations under the License.
 | |
| // *
 | |
| // *****************************************************************************
 | |
| // * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 | |
| //*/
 | |
| 
 | |
| 
 | |
| .macro push_v_regs
 | |
|     stp             d8, d9, [sp, #-16]!    // Macro purpose: push d8-d15, x8-x17 and x29/x30 as ten 16-byte pairs (160 bytes); each stp pre-decrements sp by 16
 | |
|     stp             d10, d11, [sp, #-16]!
 | |
|     stp             d12, d13, [sp, #-16]!
 | |
|     stp             d14, d15, [sp, #-16]!    // d8-d15 (low 64 bits of v8-v15) are callee-saved under AAPCS64
 | |
|     stp             X8, X9, [sp, #-16]!
 | |
|     stp             X10, X11, [sp, #-16]!
 | |
|     stp             X12, X13, [sp, #-16]!
 | |
|     stp             X14, X15, [sp, #-16]!
 | |
|     stp             X16, X17, [sp, #-16]!
 | |
|     stp             X29, X30, [sp, #-16]!    // x29 and x30 are reused as scratch registers further down in this file
 | |
| .endm
 | |
| .macro pop_v_regs
 | |
|     ldp             X29, X30, [sp], #16    // Macro purpose: reload the ten pairs stored by push_v_regs, in exact reverse order; each ldp post-increments sp by 16
 | |
|     ldp             X16, X17, [sp], #16
 | |
|     ldp             X14, X15, [sp], #16
 | |
|     ldp             X12, X13, [sp], #16
 | |
|     ldp             X10, X11, [sp], #16
 | |
|     ldp             X8, X9, [sp], #16
 | |
|     ldp             d14, d15, [sp], #16
 | |
|     ldp             d12, d13, [sp], #16
 | |
|     ldp             d10, d11, [sp], #16
 | |
|     ldp             d8, d9, [sp], #16    // last pair popped: sp is back at its value from before push_v_regs
 | |
| .endm
 | |
| 
 | |
| .macro swp reg1, reg2
 | |
|     MOV             x16, \reg1    // Macro purpose: exchange \reg1 and \reg2 through x16 (x16 is clobbered); x16 = old \reg1
 | |
|     MOV             \reg1, \reg2    // \reg1 = old \reg2
 | |
|     MOV             \reg2, x16    // \reg2 = old \reg1; callers in this file pass 64-bit vector elements such as V17.D[0]
 | |
| .endm
 | |
| .text
 | |
| .p2align 2
 | |
| .global ixheaacd_sbr_imdct_using_fft
 | |
| ixheaacd_sbr_imdct_using_fft:
 | |
|     push_v_regs
 | |
| // Incoming registers consumed below: x0 = base for the 16-bit pair loads in MIDDLE_LOOP_R4, x1 = size, x2 = base for the first-pass gathers,
 | |
| // x3 = destination of the first-pass stores (then processed in place), x7 = byte table read with LDRB. x4-x6 are overwritten before any use.
 | |
| COND_6: cmp         x1, #0x10    // size == 16 -> radix-4 first pass; anything else falls through to COND_7
 | |
|     bne             COND_7
 | |
|     MOV             X8, #1    // X8 = trip count of OUTER_LOOP_R4
 | |
|     MOV             X4, X7    // X4 = running pointer into the byte table
 | |
|     B               RADIX_4_FIRST_START
 | |
| 
 | |
| COND_7: cmp         x1, #0x20    // NOTE(review): no branch in the visible code consumes this compare; every size other than 0x10 takes the radix-8 path
 | |
| 
 | |
|     mov             x8, #1    // same setup as the size-16 path
 | |
|     mov             x4, x7
 | |
| 
 | |
| 
 | |
| RADIX_8_FIRST_START:
 | |
| // First pass, radix-8 form: W9 = W1 / 32 iterations; W1 is doubled and used below (as X1) as the byte stride of the gathers.
 | |
|     LSR             W9 , W1, #5
 | |
|     LSL             W1, W1, #1
 | |
| 
 | |
| RADIX_8_FIRST_LOOP:
 | |
| 
 | |
|     MOV             X5 , X2    // X5, X6, X7, X11: read pointers for lanes 0..3, each restarted from X2
 | |
|     MOV             X6 , X2
 | |
|     MOV             X7 , X2
 | |
|     MOV             X11 , X2
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // Lane 0 gather. Let p = X2 + 8 * (byte at [X4]). The post-incrementing LD2s and the ADD/SUB fix-ups read
 | |
| //   {V0,V1}[0] <- [p]   {V2,V3}[0] <- [p + X1]   {V4,V5}[0] <- [p + 2*X1]   {V6,V7}[0] <- [p + 3*X1]
 | |
| // and leave X5 == p. Each LD2 splits two consecutive 32-bit words across the two registers of the pair
 | |
| // (presumably the real and imaginary parts of one complex sample - TODO confirm).
 | |
|     LDRB            W12, [X4]    // zero-extended table byte
 | |
|     ADD             X5, X5, X12, LSL #3    // byte index scaled by 8 (one pair of 32-bit words)
 | |
|     LD2             {V0.S, V1.S}[0], [X5], X1
 | |
|     ADD             X5, X5, X1
 | |
|     LD2             {V4.S, V5.S}[0], [X5], X1
 | |
|     SUB             X5, X5, X1, LSL #1
 | |
|     LD2             {V2.S, V3.S}[0], [X5], X1
 | |
|     ADD             X5, X5, X1
 | |
|     LD2             {V6.S, V7.S}[0], [X5], X1
 | |
|     SUB             X5, X5, X1, LSL #2    // back to p
 | |
| // Lane 1: same pattern through X6, using the byte at [X4, #1].
 | |
|     LDRB            W12, [X4, #1]
 | |
|     ADD             X6, X6, X12, LSL #3
 | |
|     LD2             {V0.S, V1.S}[1], [X6] , X1
 | |
|     ADD             X6, X6, X1
 | |
|     LD2             {V4.S, V5.S}[1], [X6] , X1
 | |
|     SUB             X6, X6, X1, LSL #1
 | |
|     LD2             {V2.S, V3.S}[1], [X6] , X1
 | |
|     ADD             X6, X6, X1
 | |
|     LD2             {V6.S, V7.S}[1], [X6], X1
 | |
|     SUB             X6, X6, X1, LSL #2
 | |
| 
 | |
| // Lane 2 (X7): only the [p] and [p + 2*X1] loads are issued here; the other two follow further down.
 | |
|     LDRB            W12, [X4, #2]
 | |
|     ADD             X7, X7, X12, LSL #3
 | |
|     LD2             {V0.S, V1.S}[2], [X7] , X1
 | |
|     ADD             X7, X7, X1
 | |
|     LD2             {V4.S, V5.S}[2], [X7] , X1
 | |
|     SUB             X7, X7, X1, LSL #1
 | |
| // Lane 3 (X11): likewise.
 | |
|     LDRB            W12, [X4, #3]
 | |
|     ADD             X11, X11, X12, LSL #3
 | |
|     LD2             {V0.S, V1.S}[3], [X11] , X1
 | |
|     ADD             X11, X11, X1
 | |
|     LD2             {V4.S, V5.S}[3], [X11] , X1
 | |
|     SUB             X11, X11, X1, LSL #1
 | |
| 
 | |
| // Add/sub butterflies on the values loaded so far, interleaved with the remaining lane 2/3 loads.
 | |
|     ADD             V8.4S, V0.4S, V4.4S
 | |
|     LD2             {V2.S, V3.S}[2], [X7] , X1
 | |
|     ADD             X7, X7, X1
 | |
| 
 | |
| 
 | |
|     SUB             V9.4S, V0.4S, V4.4S
 | |
|     LD2             {V6.S, V7.S}[2], [X7], X1
 | |
|     SUB             X7, X7, X1, LSL #2    // X7 back at its lane start address
 | |
| 
 | |
| 
 | |
|     ADD             V0.4S, V1.4S, V5.4S
 | |
|     LD2             {V2.S, V3.S}[3], [X11] , X1
 | |
|     ADD             X11, X11, X1
 | |
| 
 | |
|     SUB             V4.4S, V1.4S, V5.4S
 | |
|     LD2             {V6.S, V7.S}[3], [X11], X1
 | |
|     SUB             X11, X11, X1, LSL #2    // X11 back at its lane start address
 | |
| 
 | |
|     ADD             X4, X4, #4    // four table bytes consumed this iteration
 | |
| // Step every lane pointer by X1/2 to reach the other four inputs, read from [p + X1/2 + k*X1], k = 0..3.
 | |
|     ADD             X5, X5, X1, LSR #1
 | |
|     ADD             X6, X6, X1, LSR #1
 | |
|     ADD             X7, X7, X1, LSR #1
 | |
|     ADD             X11, X11, X1, LSR #1
 | |
| 
 | |
| 
 | |
|     ADD             V1.4S, V2.4S, V6.4S
 | |
|     LD2             {V14.S, V15.S}[0], [X5] , X1    // lane 0, k = 0
 | |
| 
 | |
| 
 | |
|     SUB             V5.4S, V2.4S, V6.4S
 | |
|     LD2             {V10.S, V11.S}[0], [X5] , X1    // lane 0, k = 1
 | |
| 
 | |
| 
 | |
|     ADD             V2.4S, V3.4S, V7.4S
 | |
|     LD2             {V12.S, V13.S}[0], [X5] , X1    // lane 0, k = 2
 | |
| 
 | |
| 
 | |
|     SUB             V6.4S, V3.4S, V7.4S
 | |
|     LD2             {V14.S, V15.S}[1], [X6] , X1
 | |
| 
 | |
|     ADD             V3.4S, V9.4S, V6.4S
 | |
|     LD2             {V10.S, V11.S}[1], [X6] , X1
 | |
| 
 | |
|     SUB             V7.4S, V9.4S, V6.4S
 | |
|     LD2             {V12.S, V13.S}[1], [X6] , X1
 | |
| 
 | |
|     SUB             V6.4S, V4.4S, V5.4S
 | |
|     LD2             {V14.S, V15.S}[2], [X7] , X1
 | |
| 
 | |
|     ADD             V9.4S, V4.4S, V5.4S
 | |
|     LD2             {V10.S, V11.S}[2], [X7] , X1
 | |
| 
 | |
|     ADD             V4.4S, V8.4S, V1.4S
 | |
|     LD2             {V12.S, V13.S}[2], [X7] , X1
 | |
| 
 | |
|     SUB             V5.4S, V8.4S, V1.4S
 | |
|     LD2             {V14.S, V15.S}[3], [X11] , X1
 | |
| 
 | |
|     ADD             V8.4S, V0.4S, V2.4S
 | |
|     LD2             {V10.S, V11.S}[3], [X11] , X1
 | |
| 
 | |
|     SUB             V0.4S, V0.4S, V2.4S
 | |
|     LD2             {V12.S, V13.S}[3], [X11] , X1
 | |
| 
 | |
| // k = 3 loads reuse V1/V2: the sums they held were already consumed by the ADD/SUB instructions above.
 | |
|     LD2             {V1.S, V2.S}[0], [X5], X1
 | |
| 
 | |
|     ADD             V17.4S, V14.4S, V12.4S
 | |
| 
 | |
|     LD2             {V1.S, V2.S}[1], [X6] , X1
 | |
| 
 | |
|     SUB             V16.4S, V14.4S, V12.4S
 | |
| 
 | |
|     LD2             {V1.S, V2.S}[2], [X7] , X1
 | |
| 
 | |
|     ADD             V14.4S, V15.4S, V13.4S
 | |
| 
 | |
|     LD2             {V1.S, V2.S}[3], [X11] , X1
 | |
| 
 | |
|     SUB             V12.4S, V15.4S, V13.4S
 | |
| 
 | |
|     ADD             V15.4S, V10.4S, V1.4S
 | |
|     SUB             V13.4S, V10.4S, V1.4S
 | |
|     ADD             V10.4S, V11.4S, V2.4S
 | |
|     SUB             V1.4S, V11.4S, V2.4S
 | |
| 
 | |
|     ADD             V11.4S, V17.4S, V15.4S
 | |
|     SUB             V2.4S, V17.4S, V15.4S
 | |
|     ADD             V17.4S, V14.4S, V10.4S
 | |
|     SUB             V15.4S, V14.4S, V10.4S
 | |
| 
 | |
|     ADD             V14.4S, V16.4S, V12.4S
 | |
|     SUB             V10.4S, V16.4S, V12.4S
 | |
|     ADD             V16.4S, V13.4S, V1.4S
 | |
|     SUB             V12.4S, V13.4S, V1.4S
 | |
| 
 | |
|     ADD             V1.4S , V14.4S, V12.4S
 | |
|     SUB             V13.4S, V14.4S, V12.4S
 | |
|     SUB             V12.4S, V16.4S, V10.4S
 | |
| // UZP1/UZP2 with the same source twice split each 32-bit lane into its low (UZP1) and high (UZP2) 16-bit halves.
 | |
|     UZP1            V22.8H, V1.8H, V1.8H
 | |
|     UZP2            V23.8H, V1.8H, V1.8H
 | |
|     ADD             V14.4S, V16.4S, V10.4S
 | |
| 
 | |
|     UZP1            V26.8H, V13.8H, V13.8H
 | |
|     UZP2            V27.8H, V13.8H, V13.8H
 | |
|     ADD             V16.4S, V4.4S, V11.4S
 | |
| 
 | |
|     UZP1            V24.8H, V12.8H, V12.8H
 | |
|     UZP2            V25.8H, V12.8H, V12.8H
 | |
|     SUB             V10.4S, V4.4S, V11.4S
 | |
| 
 | |
|     UZP1            V28.8H, V14.8H, V14.8H
 | |
|     UZP2            V29.8H, V14.8H, V14.8H
 | |
|     ADD             V4.4S, V8.4S, V17.4S
 | |
| 
 | |
|     MOV             W14, #0x5a82    // 23170; presumably 1/sqrt(2) in Q15 - TODO confirm
 | |
| 
 | |
|     SUB             V11.4S, V8.4S, V17.4S
 | |
| 
 | |
|     ADD             V8.4S, V5.4S, V15.4S
 | |
|     SUB             V17.4S, V5.4S, V15.4S
 | |
|     SUB             V5.4S, V0.4S, V2.4S
 | |
|     ADD             V15.4S, V0.4S, V2.4S
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // 32x16 multiply by the constant c broadcast into V31: for each 32-bit value split as (hi, lo) the result is
 | |
| //   ((lo * c) >> 15) + 2 * hi * c
 | |
| // built from UMULL on the low halves, SSHR #15, then SQDMLAL (saturating, doubling) on the high halves.
 | |
|     DUP             V31.4H, W14
 | |
| 
 | |
|     UMULL           V19.4S, V26.4H, V31.4H
 | |
|     UMULL           V18.4S, V28.4H, V31.4H
 | |
|     SSHR            V19.4S, V19.4S, #15
 | |
|     SSHR            V18.4S, V18.4S, #15
 | |
| 
 | |
|     SQDMLAL         V19.4S, V27.4H, V31.4H
 | |
|     SQDMLAL         V18.4S, V29.4H, V31.4H
 | |
| 
 | |
|     UMULL           V13.4S, V24.4H, V31.4H
 | |
|     UMULL           V14.4S, V22.4H, V31.4H
 | |
| 
 | |
|     ADD             V20.4S, V3.4S, V19.4S
 | |
|     SUB             V21.4S, V3.4S, V19.4S
 | |
|     ADD             V30.4S, V6.4S, V18.4S
 | |
|     SUB             V6.4S, V6.4S, V18.4S
 | |
| 
 | |
|     SSHR            V13.4S, V13.4S, #15
 | |
|     SSHR            V14.4S, V14.4S, #15
 | |
| 
 | |
|     SQDMLAL         V13.4S, V25.4H, V31.4H
 | |
|     SQDMLAL         V14.4S, V23.4H, V31.4H
 | |
| 
 | |
|     ADD             V3.4S, V7.4S, V13.4S
 | |
|     SUB             V19.4S, V7.4S, V13.4S
 | |
|     ADD             V1.4S, V9.4S, V14.4S
 | |
|     SUB             V18.4S, V9.4S, V14.4S
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // The two swp pairs below exchange the full 128-bit contents of V17 <-> V8 and V4 <-> V16,
 | |
| // one 64-bit half at a time through x16.
 | |
|     swp             V17.D[0], V8.D[0]
 | |
|     swp             V17.D[1], V8.D[1]
 | |
|     swp             V4.D[0], V16.D[0]
 | |
|     swp             V4.D[1], V16.D[1]
 | |
| // Interleave results with TRN1/TRN2 and double them (SHL #1); 64-bit halves are regrouped by the swp calls before the stores.
 | |
|     TRN1            V12.4S, V4.4S, V20.4S
 | |
|     TRN2            V22.4S, V4.4S, V20.4S
 | |
| 
 | |
|     SHL             V12.4S, V12.4S, #1
 | |
|     TRN1            V9.4S, V17.4S, V3.4S
 | |
|     TRN2            V2.4S, V17.4S, V3.4S
 | |
|     SHL             V22.4S, V22.4S, #1
 | |
| 
 | |
|     SHL             V9.4S, V9.4S, #1
 | |
|     TRN1            V24.4S, V10.4S, V21.4S
 | |
|     TRN2            V7.4S, V10.4S, V21.4S
 | |
|     SHL             V2.4S, V2.4S, #1
 | |
| 
 | |
|     SHL             V24.4S, V24.4S, #1
 | |
|     TRN1            V13.4S, V16.4S, V6.4S
 | |
|     TRN2            V23.4S, V16.4S, V6.4S
 | |
|     SHL             V7.4S, V7.4S, #1
 | |
| 
 | |
|     SHL             V13.4S, V13.4S, #1
 | |
|     TRN1            V10.4S, V5.4S, V18.4S
 | |
|     TRN2            V3.4S, V5.4S, V18.4S
 | |
|     SHL             V23.4S, V23.4S, #1
 | |
| 
 | |
|     SHL             V10.4S, V10.4S, #1
 | |
|     TRN1            V26.4S, V8.4S, V19.4S
 | |
|     TRN2            V4.4S, V8.4S, V19.4S
 | |
|     SHL             V3.4S, V3.4S, #1
 | |
| 
 | |
|     SHL             V26.4S, V26.4S, #1
 | |
|     TRN1            V25.4S, V11.4S, V30.4S
 | |
|     TRN2            V8.4S, V11.4S, V30.4S
 | |
|     SHL             V4.4S, V4.4S, #1
 | |
| 
 | |
|     SHL             V25.4S, V25.4S, #1
 | |
|     TRN1            V27.4S, V15.4S, V1.4S
 | |
|     TRN2            V5.4S, V15.4S, V1.4S
 | |
|     SHL             V8.4S, V8.4S, #1
 | |
| 
 | |
|     SHL             V27.4S, V27.4S, #1
 | |
|     swp             V9.D[0], V12.D[1]
 | |
|     SHL             V5.4S, V5.4S, #1
 | |
|     swp             V2.D[0], V22.D[1]
 | |
| 
 | |
|     swp             V24.D[1], V26.D[0]
 | |
|     swp             V7.D[1], V4.D[0]
 | |
|     swp             V10.D[0], V13.D[1]
 | |
|     swp             V3.D[0], V23.D[1]
 | |
|     swp             V27.D[0], V25.D[1]
 | |
|     swp             V5.D[0], V8.D[1]
 | |
| 
 | |
| // Eight interleaving ST2 stores of 32 bytes each: 256 bytes are written through X3 per iteration.
 | |
|     MOV             X15, #32
 | |
|     ST2             {V12.4S, V13.4S}, [X3], X15
 | |
|     ST2             {V24.4S, V25.4S}, [X3], X15
 | |
|     ST2             {V22.4S, V23.4S}, [X3], X15
 | |
|     ST2             {V7.4S, V8.4S}, [X3], X15
 | |
|     ST2             {V9.4S, V10.4S}, [X3], X15
 | |
|     ST2             {V26.4S, V27.4S}, [X3], X15
 | |
|     ST2             {V2.4S, V3.4S}, [X3], X15
 | |
|     ST2             {V4.4S, V5.4S}, [X3], X15
 | |
| 
 | |
| 
 | |
|     SUBS            X9, X9, #1
 | |
|     BNE             RADIX_8_FIRST_LOOP
 | |
| // Loop done: undo the doubling of X1, rewind X3 by 8*X1 bytes and set X5/X4/X6 for the passes after RADIX_4_FIRST_ENDS.
 | |
|     LSR             X1, X1, #1
 | |
|     LSL             X15, X1, #3
 | |
|     SUB             X3, X3, X15
 | |
| 
 | |
|     MOV             X5, #8
 | |
|     MOV             X4, #32
 | |
|     LSR             X15, X1, #5
 | |
|     MOV             X6, X15    // X6 = X1 / 32, the inner-loop count used later
 | |
|     B               RADIX_4_FIRST_ENDS    // skip the radix-4 first pass
 | |
| 
 | |
| RADIX_8_FIRST_ENDS:
 | |
| 
 | |
| 
 | |
| 
 | |
| RADIX_4_FIRST_START:
 | |
| 
 | |
| // First pass, radix-4 form: W9 = W1 / 16 iterations; W1 is doubled and used below (as X1) as the byte stride of the gathers.
 | |
|     LSR             W9, W1, #4
 | |
|     LSL             W1, W1, #1
 | |
| 
 | |
| RADIX_4_LOOP:
 | |
| 
 | |
|     MOV             X5 , X2    // X5, X6, X7, X11: read pointers for lanes 0..3, each restarted from X2
 | |
|     MOV             X6 , X2
 | |
|     MOV             X7 , X2
 | |
|     MOV             X11 , X2
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // Lane 0 gather. Let p = X2 + 8 * (byte at [X4]). The post-incrementing LD2s and the ADD/SUB fix-ups read
 | |
| //   {V0,V1}[0] <- [p]   {V4,V5}[0] <- [p + X1]   {V8,V9}[0] <- [p + 2*X1]   {V12,V13}[0] <- [p + 3*X1]
 | |
| // Each LD2 splits two consecutive 32-bit words across the two registers of the pair.
 | |
|     LDRB            W12, [X4, #0]
 | |
|     ADD             X5, X5, X12, LSL #3    // byte index scaled by 8 (one pair of 32-bit words)
 | |
| 
 | |
|     LD2             {V0.S, V1.S}[0], [X5] , X1
 | |
|     ADD             X5, X5, X1
 | |
|     LD2             {V8.S, V9.S}[0], [X5] , X1
 | |
|     SUB             X5, X5, X1, LSL #1
 | |
|     LD2             {V4.S, V5.S}[0], [X5] , X1
 | |
|     ADD             X5, X5, X1
 | |
|     LD2             {V12.S, V13.S}[0], [X5] , X1
 | |
| // Lane 1: same pattern through X6, using the byte at [X4, #1].
 | |
|     LDRB            W12, [X4, #1]
 | |
|     ADD             X6, X6, X12, LSL #3
 | |
|     LD2             {V0.S, V1.S}[1], [X6] , X1
 | |
|     ADD             X6, X6, X1
 | |
|     LD2             {V8.S, V9.S}[1], [X6] , X1
 | |
|     SUB             X6, X6, X1, LSL #1
 | |
|     LD2             {V4.S, V5.S}[1], [X6] , X1
 | |
|     ADD             X6, X6, X1
 | |
|     LD2             {V12.S, V13.S}[1], [X6] , X1
 | |
| // Lanes 2 and 3: the [p] and [p + 2*X1] loads come first; the other two are interleaved with the butterflies below.
 | |
|     LDRB            W12, [X4, #2]
 | |
|     ADD             X7, X7, X12, LSL #3
 | |
| 
 | |
|     LD2             {V0.S, V1.S}[2], [X7] , X1
 | |
|     ADD             X7, X7, X1
 | |
|     LD2             {V8.S, V9.S}[2], [X7] , X1
 | |
| 
 | |
| 
 | |
|     LDRB            W12, [X4, #3]
 | |
|     ADD             X11, X11, X12 , LSL #3
 | |
| 
 | |
| 
 | |
|     LD2             {V0.S, V1.S}[3], [X11] , X1
 | |
|     ADD             X11, X11, X1
 | |
|     LD2             {V8.S, V9.S}[3], [X11] , X1
 | |
| 
 | |
|     SUB             X7, X7, X1, LSL #1    // X7 -> [p + X1] for lane 2
 | |
|     ADD             V16.4S, V0.4S, V8.4S
 | |
|     LD2             {V4.S, V5.S}[2], [X7] , X1
 | |
|     ADD             X7, X7, X1
 | |
|     ADD             V18.4S, V1.4S, V9.4S
 | |
|     LD2             {V12.S, V13.S}[2], [X7] , X1
 | |
| 
 | |
|     SUB             X11, X11, X1, LSL #1    // X11 -> [p + X1] for lane 3
 | |
|     SUB             V20.4S, V0.4S, V8.4S
 | |
|     LD2             {V4.S, V5.S}[3], [X11] , X1
 | |
|     ADD             X11, X11, X1
 | |
|     SUB             V22.4S, V1.4S, V9.4S
 | |
|     LD2             {V12.S, V13.S}[3], [X11] , X1
 | |
| 
 | |
|     ADD             X4, X4, #4    // four table bytes consumed this iteration
 | |
| // Remaining add/sub butterflies; the four results per lane end up in {V17,V11}, {V8,V9}, {V19,V15}, {V12,V13}.
 | |
|     ADD             V24.4S, V4.4S, V12.4S
 | |
|     ADD             V26.4S, V5.4S, V13.4S
 | |
|     SUB             V28.4S, V4.4S, V12.4S
 | |
|     SUB             V30.4S, V5.4S, V13.4S
 | |
| 
 | |
|     ADD             V17.4S, V16.4S, V24.4S
 | |
|     ADD             V11.4S, V18.4S, V26.4S
 | |
|     SUB             V19.4S, V16.4S, V24.4S
 | |
|     SUB             V15.4S, V18.4S, V26.4S
 | |
| 
 | |
|     ADD             V8.4S, V20.4S, V30.4S
 | |
|     SUB             V9.4S, V22.4S, V28.4S
 | |
|     ADD             V13.4S, V22.4S, V28.4S
 | |
|     SUB             V12.4S, V20.4S, V30.4S
 | |
| 
 | |
| 
 | |
| 
 | |
| // Interleave results with TRN1/TRN2 and double them (SHL #1); 64-bit halves are regrouped by the swp calls before the stores.
 | |
|     TRN1            V0.4S, V17.4S, V8.4S
 | |
|     TRN2            V8.4S, V17.4S, V8.4S
 | |
| 
 | |
|     SHL             V0.4S, V0.4S, #1
 | |
|     TRN1            V4.4S, V19.4S, V12.4S
 | |
|     TRN2            V12.4S, V19.4S, V12.4S
 | |
|     SHL             V8.4S, V8.4S, #1
 | |
| 
 | |
|     SHL             V4.4S, V4.4S, #1
 | |
|     TRN1            V1.4S, V11.4S, V9.4S
 | |
|     TRN2            V9.4S, V11.4S, V9.4S
 | |
|     SHL             V12.4S, V12.4S, #1
 | |
| 
 | |
|     SHL             V1.4S, V1.4S, #1
 | |
|     TRN1            V5.4S, V15.4S, V13.4S
 | |
|     TRN2            V13.4S, V15.4S, V13.4S
 | |
|     SHL             V9.4S, V9.4S, #1
 | |
| 
 | |
|     SHL             V5.4S, V5.4S, #1
 | |
|     swp             V4.D[0], V0.D[1]
 | |
|     SHL             V13.4S, V13.4S, #1
 | |
| 
 | |
|     swp             V12.D[0], V8.D[1]
 | |
| 
 | |
| 
 | |
|     swp             V5.D[0], V1.D[1]
 | |
|     swp             V13.D[0], V9.D[1]
 | |
| // Four interleaving ST2 stores of 32 bytes each: 128 bytes are written through X3 per iteration.
 | |
|     MOV             X15, #32
 | |
|     ST2             {V0.4S, V1.4S}, [X3], X15
 | |
|     ST2             {V8.4S, V9.4S}, [X3], X15
 | |
|     ST2             {V4.4S, V5.4S}, [X3], X15
 | |
|     ST2             {V12.4S, V13.4S}, [X3], X15
 | |
| 
 | |
| 
 | |
|     SUBS            W9, W9, #1
 | |
|     BNE             RADIX_4_LOOP
 | |
| // Loop done: undo the doubling of X1, rewind X3 by 8*X1 bytes and set X5/X4/X6 for the passes that follow.
 | |
|     LSR             X1, X1, #1
 | |
|     SUB             X3, X3, X1, LSL #3
 | |
|     MOV             X5, #4
 | |
|     MOV             X4, #64
 | |
|     LSR             X6, X1, #4    // X6 = X1 / 16, the inner-loop count used later
 | |
| 
 | |
| 
 | |
| RADIX_4_FIRST_ENDS:
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // The passes below work in place on the buffer written through X3 above. Its base is parked in x30
 | |
| // (the caller's x30 was stored by push_v_regs and is reloaded by pop_v_regs before RET).
 | |
| // X5 is divided by 4 here: it becomes the middle-loop count and, multiplied by 32, the row stride X12.
 | |
|     MOV             x30, X3
 | |
|     LSR             X5, X5, #2
 | |
| 
 | |
| OUTER_LOOP_R4:
 | |
| 
 | |
| 
 | |
|     MOV             X14, x30    // X14 = working pointer into the buffer
 | |
| 
 | |
|     MOV             X7, X5    // X7 = middle-loop counter
 | |
|     MOV             X2, #0    // X2 = running byte offset into the table at X0
 | |
|     MOV             X9, X0
 | |
|     LSL             X12, X5, #5    // X12 = 32 * X5 bytes between the four rows of one butterfly
 | |
| MIDDLE_LOOP_R4:
 | |
| // Load three 16-bit pairs per lane from the table at X0 (presumably twiddle factors - TODO confirm) into {V20,V21}, {V22,V23}, {V24,V25}.
 | |
| // For a lane with byte offset o the pairs come from [X0 + o], [X0 + 2*o], [X0 + 3*o]; o grows by 4*X4 from one lane to the next.
 | |
|     LD2             {V20.H, V21.H}[0], [X9], X2
 | |
|     LD2             {V22.H, V23.H}[0], [X9], X2
 | |
|     ADD             X11, X2, X4, LSL #2    // offset for lane 1
 | |
|     LD2             {V24.H, V25.H}[0], [X9]
 | |
|     ADD             X10, X0, X11
 | |
| 
 | |
|     LD2             {V20.H, V21.H}[1], [X10], X11
 | |
|     LD2             {V22.H, V23.H}[1], [X10], X11
 | |
|     ADD             X2, X11, X4, LSL #2    // offset for lane 2
 | |
|     LD2             {V24.H, V25.H}[1], [X10]
 | |
|     ADD             X9, X0, X2
 | |
| 
 | |
|     LD2             {V20.H, V21.H}[2], [X9], X2
 | |
|     LD2             {V22.H, V23.H}[2], [X9], X2
 | |
|     ADD             X11, X2, X4, LSL #2    // offset for lane 3
 | |
|     LD2             {V24.H, V25.H}[2], [X9]
 | |
|     ADD             X10, X0, X11
 | |
| 
 | |
|     LD2             {V20.H, V21.H}[3], [X10], X11
 | |
|     LD2             {V22.H, V23.H}[3], [X10], X11
 | |
|     ADD             X2, X11, X4, LSL #2    // offset for lane 0 of the next middle-loop iteration
 | |
|     LD2             {V24.H, V25.H}[3], [X10]
 | |
|     ADD             X9, X0, X2
 | |
| 
 | |
|     MOV             X10, X6    // X10 = inner-loop counter
 | |
| INNER_LOOP_R4:
 | |
| // One butterfly over four rows spaced X12 bytes apart; each row read is 32 bytes (four pairs of 32-bit words).
 | |
|     LD2             {V30.4S, V31.4S}, [X14], X12    // row 0: first words in V30, second words in V31; not multiplied, only halved
 | |
|     SSHR            V30.4S, V30.4S, #1
 | |
|     LD4             {V16.4H, V17.4H, V18.4H, V19.4H}, [X14], X12    // row 1 as 16-bit halves: V16/V17 = low/high of first words, V18/V19 = low/high of second words (little-endian assumed)
 | |
|     SSHR            V31.4S, V31.4S, #1
 | |
| 
 | |
|     USHR            V16.4H, V16.4H, #1    // low halves are treated as unsigned and halved before the signed multiplies
 | |
|     LD4             {V26.4H, V27.4H, V28.4H, V29.4H}, [X14], X12    // row 2, same split
 | |
|     USHR            V18.4H, V18.4H, #1
 | |
| // Row 1 product, complex-multiply shaped: V11 = first*V20 - second*V21, V12 = first*V21 + second*V20; low-half terms are shifted right by 15 before the high-half terms are added.
 | |
|     SMULL           V11.4S, V16.4H, V20.4H
 | |
|     SMLSL           V11.4S, V18.4H, V21.4H
 | |
|     LD4             {V0.4H, V1.4H, V2.4H, V3.4H}, [X14], X12    // row 3, same split
 | |
|     SMULL           V12.4S, V16.4H, V21.4H
 | |
|     SMLAL           V12.4S, V18.4H, V20.4H
 | |
| 
 | |
|     USHR            V26.4H, V26.4H, #1
 | |
|     USHR            V28.4H, V28.4H, #1
 | |
| 
 | |
|     LSL             x29, X12, #2    // x29 = 4 * X12, used to rewind X14 in the bypass path below
 | |
|     SUB             X14, X14, X12, LSL #2    // X14 back at row 0 of this butterfly
 | |
| 
 | |
|     USHR            V0.4H, V0.4H, #1
 | |
|     USHR            V2.4H, V2.4H, #1
 | |
| // Row 2 product goes to V13/V14 (with V22/V23); row 3 product goes to V15/V5 (with V24/V25).
 | |
|     SMULL           V13.4S, V26.4H, V22.4H
 | |
|     SMLSL           V13.4S, V28.4H, V23.4H
 | |
| 
 | |
|     SSHR            V11.4S, V11.4S, #15
 | |
| 
 | |
|     SMULL           V14.4S, V26.4H, V23.4H
 | |
|     SMLAL           V14.4S, V28.4H, V22.4H
 | |
| 
 | |
|     SMULL           V15.4S, V0.4H, V24.4H
 | |
|     SMLSL           V15.4S, V2.4H, V25.4H
 | |
| 
 | |
|     SMLAL           V11.4S, V17.4H, V20.4H
 | |
|     SMLSL           V11.4S, V19.4H, V21.4H
 | |
| 
 | |
|     SSHR            V12.4S, V12.4S, #15
 | |
|     SSHR            V13.4S, V13.4S, #15
 | |
|     SSHR            V14.4S, V14.4S, #15
 | |
|     SSHR            V15.4S, V15.4S, #15
 | |
| 
 | |
|     SMLAL           V12.4S, V17.4H, V21.4H
 | |
|     SMLAL           V12.4S, V19.4H, V20.4H
 | |
| 
 | |
|     SMULL           V5.4S, V0.4H, V25.4H
 | |
|     SMLAL           V5.4S, V2.4H, V24.4H
 | |
| 
 | |
|     SMLAL           V13.4S, V27.4H, V22.4H
 | |
|     SMLSL           V13.4S, V29.4H, V23.4H
 | |
| 
 | |
|     SMLAL           V14.4S, V27.4H, V23.4H
 | |
|     SMLAL           V14.4S, V29.4H, V22.4H
 | |
| 
 | |
|     SMLAL           V15.4S, V1.4H, V24.4H
 | |
|     SMLSL           V15.4S, V3.4H, V25.4H
 | |
| 
 | |
|     SSHR            V5.4S, V5.4S, #15
 | |
| 
 | |
|     SMLAL           V5.4S, V1.4H, V25.4H
 | |
|     SMLAL           V5.4S, V3.4H, V24.4H
 | |
| 
 | |
| // First middle-loop iteration only (X7 == X5): lane 0 skips the multiplies. The raw words of rows 1..3, arithmetically
 | |
| // shifted right by 1, overwrite lane 0 of V11/V13/V15 (first words) and of V12/V14/V5 (second words).
 | |
|     SUBS            x17, X7, X5
 | |
|     BNE             BYPASS_IF
 | |
| 
 | |
|     ADD             X14, X14, X12    // row 1
 | |
| 
 | |
|     LDR             W3, [X14]    // W3 is free as scratch: the buffer base is held in x30
 | |
|     ADD             X14, X14, X12
 | |
|     ASR             W3, W3, #1
 | |
|     MOV             V11.S[0], W3
 | |
| 
 | |
|     LDR             W3, [X14]    // row 2
 | |
|     ADD             X14, X14, X12
 | |
|     ASR             W3, W3, #1
 | |
|     MOV             V13.S[0], W3
 | |
| 
 | |
|     LDR             W3, [X14]    // row 3
 | |
|     ASR             W3, W3, #1
 | |
|     MOV             V15.S[0], W3
 | |
| 
 | |
|     SUB             X14, X14, X12, LSL #1    // back to row 1 ...
 | |
|     ADD             X14, X14, #4    // ... plus 4 bytes: the second word of each pair
 | |
| 
 | |
|     LDR             W3, [X14]
 | |
|     ADD             X14, X14, X12
 | |
|     ASR             W3, W3, #1
 | |
|     MOV             V12.S[0], W3
 | |
| 
 | |
|     LDR             W3, [X14]
 | |
|     ADD             X14, X14, X12
 | |
|     ASR             W3, W3, #1
 | |
|     MOV             V14.S[0], W3
 | |
| 
 | |
|     LDR             W3, [X14]
 | |
|     ADD             X14, X14, X12
 | |
|     ASR             W3, W3, #1
 | |
|     MOV             V5.S[0], W3
 | |
| 
 | |
|     SUB             X14, X14, #4
 | |
| 
 | |
|     SUB             X14, X14, x29    // X14 back at row 0 (x29 = 4 * X12)
 | |
| 
 | |
| BYPASS_IF:
 | |
| // Final add/sub stage; the four result rows are stored back in place with interleaving ST2, X12 bytes apart.
 | |
|     ADD             V6.4S, V30.4S, V13.4S
 | |
|     ADD             V7.4S, V31.4S, V14.4S
 | |
|     SUB             V30.4S, V30.4S, V13.4S
 | |
|     SUB             V31.4S, V31.4S, V14.4S
 | |
|     ADD             V8.4S, V11.4S, V15.4S
 | |
|     ADD             V9.4S, V12.4S, V5.4S
 | |
| 
 | |
|     SUB             V15.4S, V11.4S, V15.4S
 | |
|     SUB             V14.4S, V12.4S, V5.4S
 | |
| 
 | |
| 
 | |
|     ADD             V10.4S, V6.4S, V8.4S
 | |
|     ADD             V11.4S, V7.4S, V9.4S
 | |
|     ADD             V12.4S, V30.4S, V14.4S
 | |
|     SUB             V13.4S, V31.4S, V15.4S
 | |
| 
 | |
|     SUB             V6.4S, V6.4S, V8.4S
 | |
|     ST2             {V10.4S, V11.4S}, [X14], X12    // row 0
 | |
|     SUB             V7.4S, V7.4S, V9.4S
 | |
| 
 | |
|     SUB             V8.4S, V30.4S, V14.4S
 | |
|     ST2             {V12.4S, V13.4S}, [X14], X12    // row 1
 | |
|     ADD             V9.4S, V31.4S, V15.4S
 | |
| 
 | |
|     ST2             {V6.4S, V7.4S}, [X14], X12    // row 2
 | |
|     ST2             {V8.4S, V9.4S}, [X14], X12    // row 3; X14 now points at row 0 of the next butterfly
 | |
|     SUBS            X10, X10, #1
 | |
|     BNE             INNER_LOOP_R4
 | |
| // Rewind X14 by 8*X1 bytes, then step 32 bytes forward for the next middle-loop iteration.
 | |
|     SUB             X14, X14, X1, LSL #3
 | |
|     ADD             X14, X14, #32
 | |
| 
 | |
|     SUBS            X7, X7, #1
 | |
|     BNE             MIDDLE_LOOP_R4
 | |
| // Parameters for a further pass: X4 /= 4, X5 *= 4, X6 /= 4. X8 was set to 1 at entry, so as written this outer body runs once.
 | |
|     LSR             X4, X4, #2
 | |
|     LSL             X5, X5, #2
 | |
|     LSR             X6, X6, #2
 | |
|     SUBS            X8, X8, #1
 | |
|     BNE             OUTER_LOOP_R4
 | |
| END_LOOPS:
 | |
|     pop_v_regs
 | |
|     RET
 |