778 lines
19 KiB
ArmAsm
778 lines
19 KiB
ArmAsm
///******************************************************************************
// *
// * Copyright (C) 2018 The Android Open Source Project
// *
// * Licensed under the Apache License, Version 2.0 (the "License");
// * you may not use this file except in compliance with the License.
// * You may obtain a copy of the License at:
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// *****************************************************************************
// * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
|
|
|
|
|
|
// push_v_regs: save d8-d15, x8-x17 and x29/x30 in a 160-byte stack frame.
// The frame layout (lowest address first) is:
//   x29,x30 | x16,x17 | x14,x15 | x12,x13 | x10,x11 | x8,x9 |
//   d14,d15 | d12,d13 | d10,d11 | d8,d9
// sp is lowered first so every store lands at or above the new sp, and the
// 160-byte adjustment keeps sp 16-byte aligned.
.macro push_v_regs
    sub     sp, sp, #160
    stp     d8,  d9,  [sp, #144]
    stp     d10, d11, [sp, #128]
    stp     d12, d13, [sp, #112]
    stp     d14, d15, [sp, #96]
    stp     x8,  x9,  [sp, #80]
    stp     x10, x11, [sp, #64]
    stp     x12, x13, [sp, #48]
    stp     x14, x15, [sp, #32]
    stp     x16, x17, [sp, #16]
    stp     x29, x30, [sp]
.endm
|
|
// pop_v_regs: reload x29/x30, x8-x17 and d8-d15 from the 160-byte frame that
// push_v_regs built, then release the frame.
// All loads are issued before sp is raised so nothing is read from below sp.
.macro pop_v_regs
    ldp     x29, x30, [sp]
    ldp     x16, x17, [sp, #16]
    ldp     x14, x15, [sp, #32]
    ldp     x12, x13, [sp, #48]
    ldp     x10, x11, [sp, #64]
    ldp     x8,  x9,  [sp, #80]
    ldp     d14, d15, [sp, #96]
    ldp     d12, d13, [sp, #112]
    ldp     d10, d11, [sp, #128]
    ldp     d8,  d9,  [sp, #144]
    add     sp, sp, #160
.endm
|
|
|
|
// swp reg1, reg2: exchange the contents of two operands.
// The value travels through x16, so x16 is clobbered by every expansion.
// In this file the operands are 64-bit vector elements (e.g. V17.D[0]).
.macro swp reg1, reg2
    mov     x16, \reg1              // x16   = old reg1
    mov     \reg1, \reg2            // reg1  = reg2
    mov     \reg2, x16              // reg2  = old reg1
.endm
|
|
.text
|
|
.p2align 2
|
|
.global ixheaacd_sbr_imdct_using_fft
|
|
// ixheaacd_sbr_imdct_using_fft -- entry point and first-stage dispatch.
//
// Registers as used by the visible code (C prototype not visible here):
//   x0  base address for the 16-bit pair loads in the later stages
//   x1  size selector; 0x10 chooses the radix-4 first stage, any other
//       value falls through to the radix-8 first stage
//   x2  base address of the first-stage input gathers
//   x3  first-stage output pointer
//   x7  address of a byte index table (copied to x4 and read with LDRB)
//
// Both paths leave here with x8 = 1 (later used as the OUTER_LOOP_R4
// trip count) and x4 = x7.
ixheaacd_sbr_imdct_using_fft:
    push_v_regs

COND_6:
    mov     x8, #1                          // outer-loop count for later stages
    mov     x4, x7                          // x4 = byte index table cursor
    cmp     x1, #0x10
    b.eq    RADIX_4_FIRST_START

COND_7:
    // NOTE(review): the flags from this compare are not consumed by any
    // visible instruction; the code falls straight into the radix-8 stage.
    cmp     x1, #0x20
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// First stage, radix-8 path.
//
// On entry: x1 = size, x2 = input base, x3 = output pointer,
//           x4 = byte index table cursor.
// w9 = w1 >> 5 is the loop trip count; w1 is doubled and used as the byte
// stride of the gathers, then halved again after the loop.
//
// Each iteration reads four bytes from the table; every byte (scaled by 8)
// offsets one of four cursors (x5, x6, x7, x11) from x2. Each cursor feeds
// one vector lane with eight pairs of 32-bit words (presumably re/im of a
// complex sample -- TODO confirm), and eight 32-byte ST2 stores write the
// results through x3.
// Scratch: x5-x7, x9, x11, x12, x14, x15, x16 (via swp), v0-v31.
// ---------------------------------------------------------------------------
RADIX_8_FIRST_START:

    lsr     w9, w1, #5                      // trip count
    lsl     w1, w1, #1                      // gather stride

RADIX_8_FIRST_LOOP:

    mov     x5, x2
    mov     x6, x2
    mov     x7, x2
    mov     x11, x2

    // ---- lane 0: first four pairs ----
    ldrb    w12, [x4]
    add     x5, x5, x12, lsl #3
    ld2     {v0.s, v1.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v4.s, v5.s}[0], [x5], x1
    sub     x5, x5, x1, lsl #1
    ld2     {v2.s, v3.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v6.s, v7.s}[0], [x5], x1
    sub     x5, x5, x1, lsl #2              // back to the lane's first address

    // ---- lane 1: first four pairs ----
    ldrb    w12, [x4, #1]
    add     x6, x6, x12, lsl #3
    ld2     {v0.s, v1.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v4.s, v5.s}[1], [x6], x1
    sub     x6, x6, x1, lsl #1
    ld2     {v2.s, v3.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v6.s, v7.s}[1], [x6], x1
    sub     x6, x6, x1, lsl #2

    // ---- lane 2: first two pairs (rest interleaved with arithmetic) ----
    ldrb    w12, [x4, #2]
    add     x7, x7, x12, lsl #3
    ld2     {v0.s, v1.s}[2], [x7], x1
    add     x7, x7, x1
    ld2     {v4.s, v5.s}[2], [x7], x1
    sub     x7, x7, x1, lsl #1

    // ---- lane 3: first two pairs ----
    ldrb    w12, [x4, #3]
    add     x11, x11, x12, lsl #3
    ld2     {v0.s, v1.s}[3], [x11], x1
    add     x11, x11, x1
    ld2     {v4.s, v5.s}[3], [x11], x1
    sub     x11, x11, x1, lsl #1

    // Butterfly arithmetic starts here, interleaved with the remaining loads.
    add     v8.4s, v0.4s, v4.4s
    ld2     {v2.s, v3.s}[2], [x7], x1
    add     x7, x7, x1

    sub     v9.4s, v0.4s, v4.4s
    ld2     {v6.s, v7.s}[2], [x7], x1
    sub     x7, x7, x1, lsl #2

    add     v0.4s, v1.4s, v5.4s
    ld2     {v2.s, v3.s}[3], [x11], x1
    add     x11, x11, x1

    sub     v4.4s, v1.4s, v5.4s
    ld2     {v6.s, v7.s}[3], [x11], x1
    sub     x11, x11, x1, lsl #2

    add     x4, x4, #4                      // four table bytes consumed

    // Move every cursor half a stride forward for the second set of loads.
    add     x5, x5, x1, lsr #1
    add     x6, x6, x1, lsr #1
    add     x7, x7, x1, lsr #1
    add     x11, x11, x1, lsr #1

    add     v1.4s, v2.4s, v6.4s
    ld2     {v14.s, v15.s}[0], [x5], x1

    sub     v5.4s, v2.4s, v6.4s
    ld2     {v10.s, v11.s}[0], [x5], x1

    add     v2.4s, v3.4s, v7.4s
    ld2     {v12.s, v13.s}[0], [x5], x1

    sub     v6.4s, v3.4s, v7.4s
    ld2     {v14.s, v15.s}[1], [x6], x1

    add     v3.4s, v9.4s, v6.4s
    ld2     {v10.s, v11.s}[1], [x6], x1

    sub     v7.4s, v9.4s, v6.4s
    ld2     {v12.s, v13.s}[1], [x6], x1

    sub     v6.4s, v4.4s, v5.4s
    ld2     {v14.s, v15.s}[2], [x7], x1

    add     v9.4s, v4.4s, v5.4s
    ld2     {v10.s, v11.s}[2], [x7], x1

    add     v4.4s, v8.4s, v1.4s
    ld2     {v12.s, v13.s}[2], [x7], x1

    sub     v5.4s, v8.4s, v1.4s
    ld2     {v14.s, v15.s}[3], [x11], x1

    add     v8.4s, v0.4s, v2.4s
    ld2     {v10.s, v11.s}[3], [x11], x1

    sub     v0.4s, v0.4s, v2.4s
    ld2     {v12.s, v13.s}[3], [x11], x1

    ld2     {v1.s, v2.s}[0], [x5], x1

    add     v17.4s, v14.4s, v12.4s

    ld2     {v1.s, v2.s}[1], [x6], x1

    sub     v16.4s, v14.4s, v12.4s

    ld2     {v1.s, v2.s}[2], [x7], x1

    add     v14.4s, v15.4s, v13.4s

    ld2     {v1.s, v2.s}[3], [x11], x1

    sub     v12.4s, v15.4s, v13.4s

    add     v15.4s, v10.4s, v1.4s
    sub     v13.4s, v10.4s, v1.4s
    add     v10.4s, v11.4s, v2.4s
    sub     v1.4s, v11.4s, v2.4s

    add     v11.4s, v17.4s, v15.4s
    sub     v2.4s, v17.4s, v15.4s
    add     v17.4s, v14.4s, v10.4s
    sub     v15.4s, v14.4s, v10.4s

    add     v14.4s, v16.4s, v12.4s
    sub     v10.4s, v16.4s, v12.4s
    add     v16.4s, v13.4s, v1.4s
    sub     v12.4s, v13.4s, v1.4s

    add     v1.4s, v14.4s, v12.4s
    sub     v13.4s, v14.4s, v12.4s
    sub     v12.4s, v16.4s, v10.4s

    // Split 32-bit lanes into low (uzp1) and high (uzp2) 16-bit halves so the
    // products below can be formed from 16-bit multiplies.
    uzp1    v22.8h, v1.8h, v1.8h
    uzp2    v23.8h, v1.8h, v1.8h
    add     v14.4s, v16.4s, v10.4s

    uzp1    v26.8h, v13.8h, v13.8h
    uzp2    v27.8h, v13.8h, v13.8h
    add     v16.4s, v4.4s, v11.4s

    uzp1    v24.8h, v12.8h, v12.8h
    uzp2    v25.8h, v12.8h, v12.8h
    sub     v10.4s, v4.4s, v11.4s

    uzp1    v28.8h, v14.8h, v14.8h
    uzp2    v29.8h, v14.8h, v14.8h
    add     v4.4s, v8.4s, v17.4s

    mov     w14, #0x5a82                    // 23170, about 2^15 / sqrt(2)

    sub     v11.4s, v8.4s, v17.4s

    add     v8.4s, v5.4s, v15.4s
    sub     v17.4s, v5.4s, v15.4s
    sub     v5.4s, v0.4s, v2.4s
    add     v15.4s, v0.4s, v2.4s

    dup     v31.4h, w14                     // broadcast the constant

    // (low half * c) >> 15, then accumulate 2 * (high half * c).
    umull   v19.4s, v26.4h, v31.4h
    umull   v18.4s, v28.4h, v31.4h
    sshr    v19.4s, v19.4s, #15
    sshr    v18.4s, v18.4s, #15

    sqdmlal v19.4s, v27.4h, v31.4h
    sqdmlal v18.4s, v29.4h, v31.4h

    umull   v13.4s, v24.4h, v31.4h
    umull   v14.4s, v22.4h, v31.4h

    add     v20.4s, v3.4s, v19.4s
    sub     v21.4s, v3.4s, v19.4s
    add     v30.4s, v6.4s, v18.4s
    sub     v6.4s, v6.4s, v18.4s

    sshr    v13.4s, v13.4s, #15
    sshr    v14.4s, v14.4s, #15

    sqdmlal v13.4s, v25.4h, v31.4h
    sqdmlal v14.4s, v23.4h, v31.4h

    add     v3.4s, v7.4s, v13.4s
    sub     v19.4s, v7.4s, v13.4s
    add     v1.4s, v9.4s, v14.4s
    sub     v18.4s, v9.4s, v14.4s

    // Whole-register exchanges done as two 64-bit element swaps each.
    swp     v17.d[0], v8.d[0]
    swp     v17.d[1], v8.d[1]
    swp     v4.d[0], v16.d[0]
    swp     v4.d[1], v16.d[1]

    // Transpose pairs of results and double them (shl #1) before storing.
    trn1    v12.4s, v4.4s, v20.4s
    trn2    v22.4s, v4.4s, v20.4s

    shl     v12.4s, v12.4s, #1
    trn1    v9.4s, v17.4s, v3.4s
    trn2    v2.4s, v17.4s, v3.4s
    shl     v22.4s, v22.4s, #1

    shl     v9.4s, v9.4s, #1
    trn1    v24.4s, v10.4s, v21.4s
    trn2    v7.4s, v10.4s, v21.4s
    shl     v2.4s, v2.4s, #1

    shl     v24.4s, v24.4s, #1
    trn1    v13.4s, v16.4s, v6.4s
    trn2    v23.4s, v16.4s, v6.4s
    shl     v7.4s, v7.4s, #1

    shl     v13.4s, v13.4s, #1
    trn1    v10.4s, v5.4s, v18.4s
    trn2    v3.4s, v5.4s, v18.4s
    shl     v23.4s, v23.4s, #1

    shl     v10.4s, v10.4s, #1
    trn1    v26.4s, v8.4s, v19.4s
    trn2    v4.4s, v8.4s, v19.4s
    shl     v3.4s, v3.4s, #1

    shl     v26.4s, v26.4s, #1
    trn1    v25.4s, v11.4s, v30.4s
    trn2    v8.4s, v11.4s, v30.4s
    shl     v4.4s, v4.4s, #1

    shl     v25.4s, v25.4s, #1
    trn1    v27.4s, v15.4s, v1.4s
    trn2    v5.4s, v15.4s, v1.4s
    shl     v8.4s, v8.4s, #1

    shl     v27.4s, v27.4s, #1
    swp     v9.d[0], v12.d[1]
    shl     v5.4s, v5.4s, #1
    swp     v2.d[0], v22.d[1]

    swp     v24.d[1], v26.d[0]
    swp     v7.d[1], v4.d[0]
    swp     v10.d[0], v13.d[1]
    swp     v3.d[0], v23.d[1]
    swp     v27.d[0], v25.d[1]
    swp     v5.d[0], v8.d[1]

    // Eight interleaved 32-byte stores; x3 advances by 32 after each.
    mov     x15, #32
    st2     {v12.4s, v13.4s}, [x3], x15
    st2     {v24.4s, v25.4s}, [x3], x15
    st2     {v22.4s, v23.4s}, [x3], x15
    st2     {v7.4s, v8.4s}, [x3], x15
    st2     {v9.4s, v10.4s}, [x3], x15
    st2     {v26.4s, v27.4s}, [x3], x15
    st2     {v2.4s, v3.4s}, [x3], x15
    st2     {v4.4s, v5.4s}, [x3], x15

    subs    x9, x9, #1
    b.ne    RADIX_8_FIRST_LOOP

    // Undo the stride doubling and rewind x3 by x1 * 8 bytes.
    lsr     x1, x1, #1
    lsl     x15, x1, #3
    sub     x3, x3, x15

    // Parameters handed to the later stages.
    mov     x5, #8
    mov     x4, #32
    lsr     x15, x1, #5
    mov     x6, x15
    b       RADIX_4_FIRST_ENDS

RADIX_8_FIRST_ENDS:
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// First stage, radix-4 path (taken when x1 == 0x10 at function entry).
//
// On entry: x1 = size, x2 = input base, x3 = output pointer,
//           x4 = byte index table cursor.
// w9 = w1 >> 4 is the loop trip count; w1 is doubled and used as the byte
// stride of the gathers, then halved again after the loop.
//
// Each iteration reads four table bytes; every byte (scaled by 8) offsets one
// of four cursors (x5, x6, x7, x11) from x2, and each cursor loads four pairs
// of 32-bit words into one vector lane (presumably re/im of a complex
// sample -- TODO confirm). Four 32-byte ST2 stores write the results via x3.
// Execution falls through to RADIX_4_FIRST_ENDS with x5 = 4, x4 = 64 and
// x6 = x1 >> 4.
// Scratch: x5-x7, x9, x11, x12, x15, x16 (via swp), v0-v30.
// ---------------------------------------------------------------------------
RADIX_4_FIRST_START:

    lsr     w9, w1, #4                      // trip count
    lsl     w1, w1, #1                      // gather stride

RADIX_4_LOOP:

    mov     x5, x2
    mov     x6, x2
    mov     x7, x2
    mov     x11, x2

    // ---- lane 0 ----
    ldrb    w12, [x4]
    add     x5, x5, x12, lsl #3

    ld2     {v0.s, v1.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v8.s, v9.s}[0], [x5], x1
    sub     x5, x5, x1, lsl #1
    ld2     {v4.s, v5.s}[0], [x5], x1
    add     x5, x5, x1
    ld2     {v12.s, v13.s}[0], [x5], x1

    // ---- lane 1 ----
    ldrb    w12, [x4, #1]
    add     x6, x6, x12, lsl #3
    ld2     {v0.s, v1.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v8.s, v9.s}[1], [x6], x1
    sub     x6, x6, x1, lsl #1
    ld2     {v4.s, v5.s}[1], [x6], x1
    add     x6, x6, x1
    ld2     {v12.s, v13.s}[1], [x6], x1

    // ---- lane 2: first two pairs ----
    ldrb    w12, [x4, #2]
    add     x7, x7, x12, lsl #3

    ld2     {v0.s, v1.s}[2], [x7], x1
    add     x7, x7, x1
    ld2     {v8.s, v9.s}[2], [x7], x1

    // ---- lane 3: first two pairs ----
    ldrb    w12, [x4, #3]
    add     x11, x11, x12, lsl #3

    ld2     {v0.s, v1.s}[3], [x11], x1
    add     x11, x11, x1
    ld2     {v8.s, v9.s}[3], [x11], x1

    // Remaining lane 2/3 loads, interleaved with the first butterflies.
    sub     x7, x7, x1, lsl #1
    add     v16.4s, v0.4s, v8.4s
    ld2     {v4.s, v5.s}[2], [x7], x1
    add     x7, x7, x1
    add     v18.4s, v1.4s, v9.4s
    ld2     {v12.s, v13.s}[2], [x7], x1

    sub     x11, x11, x1, lsl #1
    sub     v20.4s, v0.4s, v8.4s
    ld2     {v4.s, v5.s}[3], [x11], x1
    add     x11, x11, x1
    sub     v22.4s, v1.4s, v9.4s
    ld2     {v12.s, v13.s}[3], [x11], x1

    add     x4, x4, #4                      // four table bytes consumed

    add     v24.4s, v4.4s, v12.4s
    add     v26.4s, v5.4s, v13.4s
    sub     v28.4s, v4.4s, v12.4s
    sub     v30.4s, v5.4s, v13.4s

    add     v17.4s, v16.4s, v24.4s
    add     v11.4s, v18.4s, v26.4s
    sub     v19.4s, v16.4s, v24.4s
    sub     v15.4s, v18.4s, v26.4s

    add     v8.4s, v20.4s, v30.4s
    sub     v9.4s, v22.4s, v28.4s
    add     v13.4s, v22.4s, v28.4s
    sub     v12.4s, v20.4s, v30.4s

    // Transpose pairs of results and double them (shl #1) before storing.
    trn1    v0.4s, v17.4s, v8.4s
    trn2    v8.4s, v17.4s, v8.4s

    shl     v0.4s, v0.4s, #1
    trn1    v4.4s, v19.4s, v12.4s
    trn2    v12.4s, v19.4s, v12.4s
    shl     v8.4s, v8.4s, #1

    shl     v4.4s, v4.4s, #1
    trn1    v1.4s, v11.4s, v9.4s
    trn2    v9.4s, v11.4s, v9.4s
    shl     v12.4s, v12.4s, #1

    shl     v1.4s, v1.4s, #1
    trn1    v5.4s, v15.4s, v13.4s
    trn2    v13.4s, v15.4s, v13.4s
    shl     v9.4s, v9.4s, #1

    shl     v5.4s, v5.4s, #1
    swp     v4.d[0], v0.d[1]
    shl     v13.4s, v13.4s, #1

    swp     v12.d[0], v8.d[1]

    swp     v5.d[0], v1.d[1]
    swp     v13.d[0], v9.d[1]

    // Four interleaved 32-byte stores; x3 advances by 32 after each.
    mov     x15, #32
    st2     {v0.4s, v1.4s}, [x3], x15
    st2     {v8.4s, v9.4s}, [x3], x15
    st2     {v4.4s, v5.4s}, [x3], x15
    st2     {v12.4s, v13.4s}, [x3], x15

    subs    w9, w9, #1
    b.ne    RADIX_4_LOOP

    // Undo the stride doubling and rewind x3 by x1 * 8 bytes.
    lsr     x1, x1, #1
    sub     x3, x3, x1, lsl #3

    // Parameters handed to the later stages.
    mov     x5, #4
    mov     x4, #64
    lsr     x6, x1, #4
|
|
|
|
|
|
// ---------------------------------------------------------------------------
// Later radix-4 stages and function epilogue.
//
// On entry (from either first-stage path):
//   x0  base address for the 16-bit pair loads (presumably a twiddle
//       table -- TODO confirm)
//   x1  size
//   x3  start of the buffer written by the first stage
//   x4, x5, x6  stage parameters set by the first stage
//   x8  number of OUTER_LOOP_R4 passes
//
// Loop nest: OUTER_LOOP_R4 runs x8 times, MIDDLE_LOOP_R4 runs x5 times
// (x5 is first divided by 4), INNER_LOOP_R4 runs x6 times.
// x30 keeps the buffer base because w3 is reused as a scalar scratch below;
// the real return address is restored by pop_v_regs before RET.
// Scratch: x2, x3, x7, x9-x12, x14, x17, x29, x30, v0-v3, v5-v31.
// ---------------------------------------------------------------------------
RADIX_4_FIRST_ENDS:

    mov     x30, x3                         // buffer base
    lsr     x5, x5, #2

OUTER_LOOP_R4:

    mov     x14, x30                        // data cursor restarts at the base

    mov     x7, x5                          // middle-loop counter
    mov     x2, #0                          // running byte offset from x0
    mov     x9, x0
    lsl     x12, x5, #5                     // data stride = x5 * 32 bytes
MIDDLE_LOOP_R4:

    // Gather three 16-bit pairs per lane from x0. The offset grows by
    // x4 * 4 from one lane to the next; x2 and x11 alternate as the offset.
    ld2     {v20.h, v21.h}[0], [x9], x2
    ld2     {v22.h, v23.h}[0], [x9], x2
    add     x11, x2, x4, lsl #2
    ld2     {v24.h, v25.h}[0], [x9]
    add     x10, x0, x11

    ld2     {v20.h, v21.h}[1], [x10], x11
    ld2     {v22.h, v23.h}[1], [x10], x11
    add     x2, x11, x4, lsl #2
    ld2     {v24.h, v25.h}[1], [x10]
    add     x9, x0, x2

    ld2     {v20.h, v21.h}[2], [x9], x2
    ld2     {v22.h, v23.h}[2], [x9], x2
    add     x11, x2, x4, lsl #2
    ld2     {v24.h, v25.h}[2], [x9]
    add     x10, x0, x11

    ld2     {v20.h, v21.h}[3], [x10], x11
    ld2     {v22.h, v23.h}[3], [x10], x11
    add     x2, x11, x4, lsl #2
    ld2     {v24.h, v25.h}[3], [x10]
    add     x9, x0, x2

    mov     x10, x6                         // inner-loop counter
INNER_LOOP_R4:

    // Four loads, x12 bytes apart. The first is kept as 32-bit words and
    // halved; the other three are split by LD4 into 16-bit pieces
    // (low/high halves of alternating words) for the multiplies.
    ld2     {v30.4s, v31.4s}, [x14], x12
    sshr    v30.4s, v30.4s, #1
    ld4     {v16.4h, v17.4h, v18.4h, v19.4h}, [x14], x12
    sshr    v31.4s, v31.4s, #1

    ushr    v16.4h, v16.4h, #1
    ld4     {v26.4h, v27.4h, v28.4h, v29.4h}, [x14], x12
    ushr    v18.4h, v18.4h, #1

    smull   v11.4s, v16.4h, v20.4h
    smlsl   v11.4s, v18.4h, v21.4h
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x14], x12
    smull   v12.4s, v16.4h, v21.4h
    smlal   v12.4s, v18.4h, v20.4h

    ushr    v26.4h, v26.4h, #1
    ushr    v28.4h, v28.4h, #1

    lsl     x29, x12, #2                    // 4 strides, used by the rewind below
    sub     x14, x14, x12, lsl #2           // cursor back to the first load

    ushr    v0.4h, v0.4h, #1
    ushr    v2.4h, v2.4h, #1

    smull   v13.4s, v26.4h, v22.4h
    smlsl   v13.4s, v28.4h, v23.4h

    sshr    v11.4s, v11.4s, #15

    smull   v14.4s, v26.4h, v23.4h
    smlal   v14.4s, v28.4h, v22.4h

    smull   v15.4s, v0.4h, v24.4h
    smlsl   v15.4s, v2.4h, v25.4h

    smlal   v11.4s, v17.4h, v20.4h
    smlsl   v11.4s, v19.4h, v21.4h

    sshr    v12.4s, v12.4s, #15
    sshr    v13.4s, v13.4s, #15
    sshr    v14.4s, v14.4s, #15
    sshr    v15.4s, v15.4s, #15

    smlal   v12.4s, v17.4h, v21.4h
    smlal   v12.4s, v19.4h, v20.4h

    smull   v5.4s, v0.4h, v25.4h
    smlal   v5.4s, v2.4h, v24.4h

    smlal   v13.4s, v27.4h, v22.4h
    smlsl   v13.4s, v29.4h, v23.4h

    smlal   v14.4s, v27.4h, v23.4h
    smlal   v14.4s, v29.4h, v22.4h

    smlal   v15.4s, v1.4h, v24.4h
    smlsl   v15.4s, v3.4h, v25.4h

    sshr    v5.4s, v5.4s, #15

    smlal   v5.4s, v1.4h, v25.4h
    smlal   v5.4s, v3.4h, v24.4h

    // Only on the first middle-loop pass (x7 == x5): overwrite lane 0 of
    // the six products with the raw input words, halved.
    subs    x17, x7, x5
    b.ne    BYPASS_IF

    add     x14, x14, x12

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v11.s[0], w3

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v13.s[0], w3

    ldr     w3, [x14]
    asr     w3, w3, #1
    mov     v15.s[0], w3

    sub     x14, x14, x12, lsl #1
    add     x14, x14, #4                    // second word of each pair

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v12.s[0], w3

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v14.s[0], w3

    ldr     w3, [x14]
    add     x14, x14, x12
    asr     w3, w3, #1
    mov     v5.s[0], w3

    sub     x14, x14, #4

    sub     x14, x14, x29                   // cursor back to the first load

BYPASS_IF:

    // Combine the four inputs with adds and subtracts.
    add     v6.4s, v30.4s, v13.4s
    add     v7.4s, v31.4s, v14.4s
    sub     v30.4s, v30.4s, v13.4s
    sub     v31.4s, v31.4s, v14.4s
    add     v8.4s, v11.4s, v15.4s
    add     v9.4s, v12.4s, v5.4s

    sub     v15.4s, v11.4s, v15.4s
    sub     v14.4s, v12.4s, v5.4s

    add     v10.4s, v6.4s, v8.4s
    add     v11.4s, v7.4s, v9.4s
    add     v12.4s, v30.4s, v14.4s
    sub     v13.4s, v31.4s, v15.4s

    // Results go back to the four addresses just read.
    sub     v6.4s, v6.4s, v8.4s
    st2     {v10.4s, v11.4s}, [x14], x12
    sub     v7.4s, v7.4s, v9.4s

    sub     v8.4s, v30.4s, v14.4s
    st2     {v12.4s, v13.4s}, [x14], x12
    add     v9.4s, v31.4s, v15.4s

    st2     {v6.4s, v7.4s}, [x14], x12
    st2     {v8.4s, v9.4s}, [x14], x12
    subs    x10, x10, #1
    b.ne    INNER_LOOP_R4

    // Rewind by x1 * 8 bytes, then step 32 bytes to the next group.
    sub     x14, x14, x1, lsl #3
    add     x14, x14, #32

    subs    x7, x7, #1
    b.ne    MIDDLE_LOOP_R4

    // Rescale the stage parameters for the next outer pass.
    lsr     x4, x4, #2
    lsl     x5, x5, #2
    lsr     x6, x6, #2
    subs    x8, x8, #1
    b.ne    OUTER_LOOP_R4
END_LOOPS:
    pop_v_regs
    ret
|