714 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
			
		
		
	
	
			714 lines
		
	
	
		
			18 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
| ///******************************************************************************
 | |
| // *
 | |
| // * Copyright (C) 2018 The Android Open Source Project
 | |
| // *
 | |
| // * Licensed under the Apache License, Version 2.0 (the "License");
 | |
| // * you may not use this file except in compliance with the License.
 | |
| // * You may obtain a copy of the License at:
 | |
| // *
 | |
| // * http://www.apache.org/licenses/LICENSE-2.0
 | |
| // *
 | |
| // * Unless required by applicable law or agreed to in writing, software
 | |
| // * distributed under the License is distributed on an "AS IS" BASIS,
 | |
| // * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| // * See the License for the specific language governing permissions and
 | |
| // * limitations under the License.
 | |
| // *
 | |
| // *****************************************************************************
 | |
| // * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 | |
| //*/
 | |
| 
 | |
| 
 | |
// push_v_regs: save the callee-saved registers used by this file.
// Reserves one 160-byte frame below the incoming sp and fills it as:
//   [sp, #128] q8,  q9      [sp, #96] q10, q11
//   [sp, #64]  q12, q13     [sp, #32] q14, q15
//   [sp, #16]  x21, x22     [sp, #0]  x23, x24
// This is the layout pop_v_regs unwinds; sp stays 16-byte aligned.
.macro push_v_regs
    sub             sp, sp, #160
    stp             x23, x24, [sp]
    stp             x21, x22, [sp, #16]
    stp             q14, q15, [sp, #32]
    stp             q12, q13, [sp, #64]
    stp             q10, q11, [sp, #96]
    stp             q8, q9, [sp, #128]
.endm
 | |
// pop_v_regs: restore the registers saved by push_v_regs and release the
// 160-byte frame. Expects sp to point at the x23/x24 slot, with x21/x22 at
// #16, q14/q15 at #32, q12/q13 at #64, q10/q11 at #96 and q8/q9 at #128.
.macro pop_v_regs
    ldp             q8, q9, [sp, #128]
    ldp             q10, q11, [sp, #96]
    ldp             q12, q13, [sp, #64]
    ldp             q14, q15, [sp, #32]
    ldp             x21, x22, [sp, #16]
    ldp             x23, x24, [sp]
    add             sp, sp, #160
.endm
 | |
// swp: exchange two 64-bit operands (general registers or vector D lanes)
// by bouncing the first one through x16.
// Clobbers: x16.
.macro swp first, second
    mov             x16, \first         // x16 = old first
    mov             \first, \second     // first = second
    mov             \second, x16        // second = old first
.endm
 | |
| .text
 | |
// -----------------------------------------------------------------------------
// ixheaacd_post_twiddle_armv8
//
// ABI:  AAPCS64, GNU as syntax.
// In (roles as used by the code; the C prototype is not visible in this file):
//   x0 = output buffer of 32-bit words, filled from both ends towards the middle
//   x1 = input buffer of 32-bit words, consumed from both ends towards the middle
//   x2 = base pointer; packed 16-bit pairs are read starting at x2 + 7500
//        NOTE(review): presumably the twiddle table inside a larger tables
//        struct - confirm the offset against the C declaration.
//   w3 = number of 32-bit words n (treated as a signed 32-bit value)
// Out:  nothing in registers; results are stored through x0 / x7.
// Clobbers: x4-x12, x16 (via swp), flags, v0-v7, v16-v31.
//           x21-x24 and q8-q15 are saved/restored by push_v_regs/pop_v_regs.
//
// This first part selects the per-size constants, handles the first
// input pair with scalar code and sets up the pointers for the NEON stages:
//   x0 -> out[1]          (front store pointer, moves up)
//   x7 -> out[n - 9]      (back store pointer, moves down by 32 bytes)
//   x1 -> in[2]           (front load pointer, moves up)
//   x5 -> in[n - 8]       (back load pointer, moves down by 32 bytes)
//   x8 = -32              (post-index step for x5 / x7)
//   w3 = (n - 1) >> 4     (base value for the loop counter)
// -----------------------------------------------------------------------------
.global ixheaacd_post_twiddle_armv8
ixheaacd_post_twiddle_armv8:

    push_v_regs

ARM_PROLOGUE:
    cmp             w3, #0x400              // flags survive the mov/add below
    mov             x21, #7500              // 7500 does not fit an add immediate
    add             x2, x2, x21             // x2 -> first table entry
    blt             NEXT                    // n < 0x400 -> second constant set
    mov             w4, #50
    mov             w5, #-50
    mov             x6, #4                  // table stride in bytes
    dup             v10.4h, w4              // low 16 bits of w4 in every lane
    b               NEXT1

NEXT:
    mov             w4, #0x192
    mov             w5, #0xfe6e             // 16-bit two's complement of -0x192
    mov             x6, #32                 // table stride in bytes
    dup             v10.4h, w4

NEXT1:
    // First table entry: one 32-bit word holding two 16-bit values.
    ldr             w9, [x2]
    lsl             w22, w9, #16            // w22 = low halfword in bits 31:16
    and             w21, w9, #0xFFFF0000    // w21 = high halfword in bits 31:16

    ldr             w7, [x1], #4            // w7 = in[0]
    ldr             w8, [x1], #4            // w8 = in[1]; x1 -> in[2]

    add             x2, x2, x6              // step to the next table entry

    // Four 32x32 -> high-word products (product >> 32).
    smull           x11, w8, w21
    asr             x11, x11, #32           // in[1] * hi
    smull           x10, w8, w22
    asr             x10, x10, #32           // in[1] * lo
    smull           x12, w7, w21
    asr             x12, x12, #32           // in[0] * hi
    smull           x23, w7, w22
    asr             x23, x23, #32           // in[0] * lo

    add             w8, w11, w23
    sub             w10, w10, w12           // w10 = in[1]*lo - in[0]*hi
    neg             w8, w8                  // w8  = -(in[1]*hi + in[0]*lo)

    // Second stage: cross terms scaled by the size-dependent constants.
    lsl             w21, w5, #16
    lsl             w22, w4, #16
    smull           x23, w10, w21
    asr             x23, x23, #32
    add             w9, w8, w23             // w9  = w8  + ((w10 * (w5 << 16)) >> 32)
    smull           x23, w8, w22
    asr             x23, x23, #32
    add             w11, w10, w23           // w11 = w10 + ((w8  * (w4 << 16)) >> 32)

    // Pointer setup. The count arrives in w3; AAPCS64 leaves bits 63:32 of
    // x3 unspecified for a 32-bit argument, so extend explicitly instead of
    // shifting the raw 64-bit register.
    add             x7, x0, w3, sxtw #2     // x7 = out + 4*n
    sub             x7, x7, #4              // x7 -> out[n - 1]

    str             w11, [x7], #-4          // out[n - 1] = w11; x7 -> out[n - 2]

    str             w9, [x0], #4            // out[0] = w9; x0 -> out[1]

    add             x5, x1, w3, sxtw #2     // x1 is already in + 8 here
    sub             x5, x5, #40             // x5 -> in[n - 8]

    sub             w3, w3, #1
    asr             w3, w3, #4              // w3 = (n - 1) >> 4, x3 zero-extended

    sub             x7, x7, #28             // x7 -> out[n - 9], base of an 8-word store

    mov             x8, #-32                // post-index step for x5 / x7
 | |
| 
 | |
| NEON_PROLOGUE:
 | |
| // Pipeline fill: computes the first group of results. Nothing is stored here; the stores happen at the top of CORE_LOOP.
 | |
|     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
 | |
| // Halfword LD4 splits 8 words: v0/v1 = low/high halves of words 0,2,4,6 and v2/v3 = low/high halves of words 1,3,5,7 (same for v4-v7 below).
 | |
|     LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
 | |
|     LD2             {v8.h, v9.h}[0], [x2], x6
 | |
|     LD2             {v8.h, v9.h}[1], [x2], x6
 | |
|     LD2             {v8.h, v9.h}[2], [x2], x6
 | |
|     LD2             {v8.h, v9.h}[3], [x2], x6
 | |
| // v8/v9 = first/second halfword of four table entries (stride x6); v12/v13 = the same lanes in reverse order, used with the x5 data.
 | |
|     rev64           v12.4h, v8.4h
 | |
|     rev64           v13.4h, v9.4h
 | |
| // Each uMULL / ushR #16 / sMLAL triple forms ((low16 * t) >> 16) + high16 * t per lane.
 | |
|     uMULL           v30.4s, v2.4h, v13.4h
 | |
|     uMULL           v28.4s, v0.4h, v13.4h
 | |
|     uMULL           v26.4s, v2.4h, v12.4h
 | |
|     uMULL           v24.4s, v0.4h, v12.4h
 | |
| 
 | |
|     ushR            v30.4s, v30.4s, #16
 | |
|     ushR            v28.4s, v28.4s, #16
 | |
|     ushR            v26.4s, v26.4s, #16
 | |
|     ushR            v24.4s, v24.4s, #16
 | |
| 
 | |
|     sMLAL           v30.4s, v3.4h, v13.4h
 | |
|     sMLAL           v28.4s, v1.4h, v13.4h
 | |
|     sMLAL           v26.4s, v3.4h, v12.4h
 | |
|     sMLAL           v24.4s, v1.4h, v12.4h
 | |
| // Start the same products for the x1 data (v4-v7) with the unreversed v8/v9.
 | |
|     uMULL           v22.4s, v6.4h, v9.4h
 | |
|     uMULL           v20.4s, v4.4h, v9.4h
 | |
| // Back-end combination: v28 = -(v28 + v26), v30 = v30 - v24.
 | |
|     ADD             v28.4s, v28.4s , v26.4s
 | |
|     SUB             v30.4s, v30.4s , v24.4s
 | |
|     NEG             v28.4s, v28.4s
 | |
| 
 | |
|     uMULL           v18.4s, v6.4h, v8.4h
 | |
|     uMULL           v16.4s, v4.4h, v8.4h
 | |
| // Split v30 into low halfwords (v26) and high halfwords (v27) of its four lanes; v31 is scratch.
 | |
|     mov             v31.8b, v30.8b
 | |
|     mov             v27.D[0], v30.D[1]
 | |
|     ushR            v22.4s, v22.4s, #16
 | |
| 
 | |
|     mov             v24.8b, v28.8b
 | |
|     mov             v25.D[0], v28.D[1]
 | |
|     ushR            v20.4s, v20.4s, #16
 | |
| 
 | |
| 
 | |
|     UZP1            v26.4h, v31.4h, v27.4h
 | |
|     UZP2            v27.4h, v31.4h, v27.4h
 | |
|     ushR            v18.4s, v18.4s, #16
 | |
| 
 | |
| // Same split for v28: v24 = low halfwords, v25 = high halfwords.
 | |
|     mov             v31.8B , v24.8B
 | |
|     UZP1            v24.4h, v31.4h, v25.4h
 | |
|     UZP2            v25.4h, v31.4h, v25.4h
 | |
|     ushR            v16.4s, v16.4s, #16
 | |
| 
 | |
| 
 | |
|     sMLAL           v22.4s, v7.4h, v9.4h
 | |
|     sMLAL           v20.4s, v5.4h, v9.4h
 | |
|     sMLAL           v18.4s, v7.4h, v8.4h
 | |
|     sMLAL           v16.4s, v5.4h, v8.4h
 | |
| // v8/v9 are free from here on: reload them lane by lane for the next group, interleaved with the second-stage multiplies by v10.
 | |
|     LD2             {v8.h, v9.h}[0], [x2], x6
 | |
|     uMULL           v0.4s, v26.4h, v10.4h
 | |
| 
 | |
|     LD2             {v8.h, v9.h}[1], [x2], x6
 | |
|     uMULL           v2.4s, v24.4h, v10.4h
 | |
| 
 | |
| // Front-end combination: v22 = -(v22 + v16), v20 = v18 - v20.
 | |
|     LD2             {v8.h, v9.h}[2], [x2], x6
 | |
|     ADD             v22.4s, v22.4s , v16.4s
 | |
| 
 | |
|     LD2             {v8.h, v9.h}[3], [x2], x6
 | |
|     SUB             v20.4s, v18.4s , v20.4s
 | |
| 
 | |
|     rev64           v12.4h, v8.4h
 | |
|     rev64           v13.4h, v9.4h
 | |
|     NEG             v22.4s, v22.4s
 | |
| 
 | |
| // Split v22 into v18 (low halfwords) / v19 (high halfwords) and v20 into v16 / v17.
 | |
|     mov             v18.8b, v22.8b
 | |
|     mov             v19.D[0], v22.D[1]
 | |
|     ushR            v0.4s, v0.4s, #16
 | |
| 
 | |
|     mov             v16.16b, v20.16b
 | |
|     mov             v17.D[0], v20.D[1]
 | |
|     ushR            v2.4s, v2.4s, #16
 | |
| 
 | |
| 
 | |
|     MOV             v31.8b, v18.8b
 | |
|     UZP1            v18.4h, v31.4h, v19.4h
 | |
|     UZP2            v19.4h, v31.4h, v19.4h
 | |
|     sMLAL           v0.4s, v27.4h, v10.4h
 | |
| 
 | |
| 
 | |
|     MOV             v31.8b, v16.8b
 | |
|     UZP1            v16.4h, v31.4h, v17.4h
 | |
|     UZP2            v17.4h, v31.4h, v17.4h
 | |
|     sMLAL           v2.4s, v25.4h, v10.4h
 | |
| 
 | |
|     uMULL           v4.4s, v18.4h, v10.4h
 | |
|     uMULL           v6.4s, v16.4h, v10.4h
 | |
| // v0 / v2 = v30 / v28 scaled by v10. v14 = v30 + v2, v26 = v28 - v0.
 | |
|     NEG             v0.4s, v0.4s
 | |
|     ADD             v14.4s, v30.4s , v2.4s
 | |
|     ADD             v26.4s, v28.4s , v0.4s
 | |
| // rev64 plus the D-lane swap below reverses all four 32-bit lanes of v14.
 | |
|     rev64           v14.4s, v14.4s
 | |
|     ushR            v4.4s, v4.4s, #16
 | |
| 
 | |
|     swp             v14.D[0], v14.D[1]
 | |
|     ushR            v6.4s, v6.4s, #16
 | |
| // Load the next 8 words from the x5 side while the front-end scaling completes.
 | |
|     sMLAL           v4.4s, v19.4h, v10.4h
 | |
|     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
 | |
|     sMLAL           v6.4s, v17.4h, v10.4h
 | |
| 
 | |
| // Loop counter: x3 was set to (n - 1) >> 4 before NEON_PROLOGUE; CORE_LOOP runs x3 - 2 times.
 | |
|     SUB             x3, x3, #2
 | |
| // v24 = v20 + v4 with its four lanes reversed; v16 = v22 - v6.
 | |
|     ADD             v24.4s, v20.4s , v4.4s
 | |
| 
 | |
|     rev64           v24.4s, v24.4s
 | |
|     NEG             v16.4s, v6.4s
 | |
| 
 | |
|     LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
 | |
| 
 | |
|     swp             v24.D[0], v24.D[1]
 | |
|     ADD             v16.4s, v22.4s , v16.4s
 | |
| 
 | |
| 
 | |
| 
 | |
| CORE_LOOP:
 | |
|     uMULL           v30.4s, v2.4h, v13.4h    // steady state: store the previous group's results while computing the current one
 | |
|     MOV             v25.16B, v24.16B    // ST2 needs consecutive registers: pair v24 (copied to v25) with v26
 | |
|     ST2             { v25.4s, v26.4s}, [x7], x8    // 8 interleaved words to the back of the output; x7 steps down by 32 bytes
 | |
|     uMULL           v28.4s, v0.4h, v13.4h
 | |
| 
 | |
|     uMULL           v26.4s, v2.4h, v12.4h
 | |
|     MOV             v15.16B, v14.16B    // pair v14 (copied to v15) with v16
 | |
|     ST2             { v15.4s, v16.4s}, [x0], #32    // 8 interleaved words to the front of the output
 | |
|     uMULL           v24.4s, v0.4h, v12.4h
 | |
| // Each uMULL / ushR #16 / sMLAL triple forms ((low16 * t) >> 16) + high16 * t per lane.
 | |
|     ushR            v30.4s, v30.4s, #16
 | |
|     ushR            v28.4s, v28.4s, #16
 | |
|     ushR            v26.4s, v26.4s, #16
 | |
|     ushR            v24.4s, v24.4s, #16
 | |
| 
 | |
|     sMLAL           v30.4s, v3.4h, v13.4h
 | |
|     sMLAL           v28.4s, v1.4h, v13.4h
 | |
|     sMLAL           v26.4s, v3.4h, v12.4h
 | |
|     sMLAL           v24.4s, v1.4h, v12.4h
 | |
| // Products for the x1-side data (v4-v7) with the unreversed table lanes v8/v9.
 | |
|     uMULL           v22.4s, v6.4h, v9.4h
 | |
|     uMULL           v20.4s, v4.4h, v9.4h
 | |
| 
 | |
| // Back-end combination: v28 = -(v28 + v26), v30 = v30 - v24.
 | |
|     ADD             v28.4s, v28.4s , v26.4s
 | |
|     SUB             v30.4s, v30.4s , v24.4s
 | |
|     NEG             v28.4s, v28.4s
 | |
| 
 | |
|     uMULL           v18.4s, v6.4h, v8.4h
 | |
|     uMULL           v16.4s, v4.4h, v8.4h
 | |
| 
 | |
| // Split v30 into low halfwords (v26) / high halfwords (v27), then v28 into v24 / v25; v31 is scratch.
 | |
|     mov             v26.8b, v30.8b
 | |
|     mov             v27.D[0], v30.D[1]
 | |
|     ushR            v22.4s, v22.4s, #16
 | |
| 
 | |
| 
 | |
|     mov             v24.8b, v28.8b
 | |
|     mov             v25.D[0], v28.D[1]
 | |
|     ushR            v20.4s, v20.4s, #16
 | |
| 
 | |
| 
 | |
|     MOV             v31.8b, v26.8b
 | |
|     UZP1            v26.4h, v31.4h, v27.4h
 | |
|     UZP2            v27.4h, v31.4h, v27.4h
 | |
|     ushR            v18.4s, v18.4s, #16
 | |
| 
 | |
| 
 | |
|     MOV             v31.8b, v24.8b
 | |
|     UZP1            v24.4h, v31.4h, v25.4h
 | |
|     UZP2            v25.4h, v31.4h, v25.4h
 | |
|     ushR            v16.4s, v16.4s, #16
 | |
| 
 | |
| 
 | |
|     sMLAL           v22.4s, v7.4h, v9.4h
 | |
|     sMLAL           v20.4s, v5.4h, v9.4h
 | |
|     sMLAL           v18.4s, v7.4h, v8.4h
 | |
|     sMLAL           v16.4s, v5.4h, v8.4h
 | |
| // Last use of v8/v9 was just above: reload them for the next group, interleaved with the second-stage multiplies by v10.
 | |
|     LD2             {v8.h, v9.h}[0], [x2], x6
 | |
|     uMULL           v0.4s, v26.4h, v10.4h
 | |
| 
 | |
|     LD2             {v8.h, v9.h}[1], [x2], x6
 | |
|     uMULL           v2.4s, v24.4h, v10.4h
 | |
| // Front-end combination: v22 = -(v22 + v16), v20 = v18 - v20.
 | |
|     LD2             {v8.h, v9.h}[2], [x2], x6
 | |
|     ADD             v22.4s, v22.4s , v16.4s
 | |
| 
 | |
|     LD2             {v8.h, v9.h}[3], [x2], x6
 | |
|     SUB             v20.4s, v18.4s , v20.4s
 | |
| 
 | |
|     rev64           v12.4h, v8.4h
 | |
|     rev64           v13.4h, v9.4h
 | |
|     NEG             v22.4s, v22.4s
 | |
| // Split v22 into v18 / v19 and v20 into v16 / v17 (low / high halfwords).
 | |
|     mov             v18.8b, v22.8b
 | |
|     mov             v19.D[0], v22.D[1]
 | |
|     ushR            v0.4s, v0.4s, #16
 | |
| 
 | |
|     mov             v16.8b, v20.8b
 | |
|     mov             v17.D[0], v20.D[1]
 | |
|     ushR            v2.4s, v2.4s, #16
 | |
| 
 | |
| 
 | |
|     MOV             v31.8b, v18.8b
 | |
|     UZP1            v18.4h, v31.4h, v19.4h
 | |
|     UZP2            v19.4h, v31.4h, v19.4h
 | |
|     sMLAL           v0.4s, v27.4h, v10.4h
 | |
| 
 | |
| 
 | |
|     MOV             v31.8b, v16.8b
 | |
|     UZP1            v16.4h, v31.4h, v17.4h
 | |
|     UZP2            v17.4h, v31.4h, v17.4h
 | |
|     sMLAL           v2.4s, v25.4h, v10.4h
 | |
| 
 | |
|     uMULL           v4.4s, v18.4h, v10.4h
 | |
|     uMULL           v6.4s, v16.4h, v10.4h
 | |
| // v0 / v2 = v30 / v28 scaled by v10. v14 = v30 + v2, v26 = v28 - v0.
 | |
|     NEG             v0.4s, v0.4s
 | |
|     ADD             v14.4s, v30.4s , v2.4s
 | |
|     ADD             v26.4s, v28.4s , v0.4s
 | |
| // rev64 plus the D-lane swap below reverses all four 32-bit lanes of v14.
 | |
|     rev64           v14.4s, v14.4s
 | |
|     ushR            v4.4s, v4.4s, #16
 | |
| 
 | |
|     swp             v14.D[0], v14.D[1]
 | |
|     ushR            v6.4s, v6.4s, #16
 | |
| // Fetch the next 8 words from the x5 side for the following pass.
 | |
|     sMLAL           v4.4s, v19.4h, v10.4h
 | |
|     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
 | |
|     sMLAL           v6.4s, v17.4h, v10.4h
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // v24 = v20 + v4 with its four lanes reversed; v16 = v22 - v6.
 | |
|     ADD             v24.4s, v20.4s , v4.4s
 | |
| 
 | |
|     rev64           v24.4s, v24.4s
 | |
|     NEG             v16.4s, v6.4s
 | |
| 
 | |
|     LD4             {v4.4h, v5.4h, v6.4h, v7.4h}, [x1], #32
 | |
| 
 | |
|     swp             v24.D[0], v24.D[1]
 | |
|     ADD             v16.4s, v22.4s , v16.4s
 | |
| 
 | |
|     SUBS            x3, x3, #1    // x3 counts remaining passes; SUBS sets Z for the BNE below
 | |
| 
 | |
|     BNE             CORE_LOOP
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| NEON_EPILOGUE:
 | |
|     uMULL           v30.4s, v2.4h, v13.4h    // pipeline drain: store the last CORE_LOOP results and process the data/table lanes it loaded
 | |
|     MOV             v25.16B, v24.16B
 | |
|     ST2             { v25.4s, v26.4s}, [x7], x8
 | |
|     uMULL           v28.4s, v0.4h, v13.4h
 | |
| 
 | |
|     uMULL           v26.4s, v2.4h, v12.4h
 | |
|     MOV             v15.16B, v14.16B
 | |
|     ST2             { v15.4s, v16.4s}, [x0], #32
 | |
|     uMULL           v24.4s, v0.4h, v12.4h
 | |
| 
 | |
| 
 | |
| // Each uMULL / ushR #16 / sMLAL triple forms ((low16 * t) >> 16) + high16 * t per lane.
 | |
|     ushR            v30.4s, v30.4s, #16
 | |
|     ushR            v28.4s, v28.4s, #16
 | |
|     ushR            v26.4s, v26.4s, #16
 | |
|     ushR            v24.4s, v24.4s, #16
 | |
| 
 | |
|     sMLAL           v30.4s, v3.4h, v13.4h
 | |
|     sMLAL           v28.4s, v1.4h, v13.4h
 | |
|     sMLAL           v26.4s, v3.4h, v12.4h
 | |
|     sMLAL           v24.4s, v1.4h, v12.4h
 | |
| 
 | |
| 
 | |
|     uMULL           v22.4s, v6.4h, v9.4h
 | |
|     uMULL           v20.4s, v4.4h, v9.4h
 | |
| 
 | |
| // Back-end combination: v28 = -(v28 + v26), v30 = v30 - v24.
 | |
|     ADD             v28.4s, v28.4s , v26.4s
 | |
|     SUB             v30.4s, v30.4s , v24.4s
 | |
|     NEG             v28.4s, v28.4s
 | |
| 
 | |
|     uMULL           v18.4s, v6.4h, v8.4h
 | |
|     uMULL           v16.4s, v4.4h, v8.4h
 | |
| 
 | |
| // Split v30 into low halfwords (v26) / high halfwords (v27), then v28 into v24 / v25; v31 is scratch.
 | |
|     mov             v26.8b, v30.8b
 | |
|     mov             v27.D[0], v30.D[1]
 | |
|     ushR            v22.4s, v22.4s, #16
 | |
| 
 | |
|     mov             v24.16b, v28.16b
 | |
|     mov             v25.D[0], v28.D[1]
 | |
|     ushR            v20.4s, v20.4s, #16
 | |
| 
 | |
| 
 | |
|     mov             v31.8b, v26.8b
 | |
|     UZP1            v26.4h, v31.4h, v27.4h
 | |
|     UZP2            v27.4h, v31.4h, v27.4h
 | |
|     ushR            v18.4s, v18.4s, #16
 | |
| 
 | |
| 
 | |
|     mov             v31.8b, v24.8b
 | |
|     UZP1            v24.4h, v31.4h, v25.4h
 | |
|     UZP2            v25.4h, v31.4h, v25.4h
 | |
|     ushR            v16.4s, v16.4s, #16
 | |
| 
 | |
| 
 | |
|     sMLAL           v22.4s, v7.4h, v9.4h
 | |
|     sMLAL           v20.4s, v5.4h, v9.4h
 | |
|     sMLAL           v18.4s, v7.4h, v8.4h
 | |
|     sMLAL           v16.4s, v5.4h, v8.4h
 | |
| 
 | |
| // Unlike CORE_LOOP, no table or data loads are interleaved here; they are issued after the stores below.
 | |
|     uMULL           v0.4s, v26.4h, v10.4h
 | |
| 
 | |
| 
 | |
|     uMULL           v2.4s, v24.4h, v10.4h
 | |
| 
 | |
| // Front-end combination: v22 = -(v22 + v16), v20 = v18 - v20.
 | |
|     ADD             v22.4s, v22.4s , v16.4s
 | |
| 
 | |
| 
 | |
|     SUB             v20.4s, v18.4s , v20.4s
 | |
| 
 | |
| 
 | |
|     NEG             v22.4s, v22.4s
 | |
| 
 | |
| 
 | |
|     mov             v18.16b, v22.16b
 | |
|     ushR            v0.4s, v0.4s, #16
 | |
| 
 | |
|     mov             v16.16b, v20.16b
 | |
|     ushR            v2.4s, v2.4s, #16
 | |
| 
 | |
| // Split v22 into v18 / v19 and v20 into v16 / v17 (low / high halfwords).
 | |
|     mov             v31.16b, v18.16b
 | |
|     mov             v19.d[0], v31.d[1]
 | |
|     UZP1            v18.4h, v31.4h, v19.4h
 | |
|     UZP2            v19.4h, v31.4h, v19.4h
 | |
|     sMLAL           v0.4s, v27.4h, v10.4h
 | |
| 
 | |
| 
 | |
|     mov             v31.16b, v16.16b
 | |
|     mov             v17.d[0], v31.d[1]
 | |
|     UZP1            v16.4h, v31.4h, v17.4h
 | |
|     UZP2            v17.4h, v31.4h, v17.4h
 | |
|     sMLAL           v2.4s, v25.4h, v10.4h
 | |
| 
 | |
|     uMULL           v4.4s, v18.4h, v10.4h
 | |
|     uMULL           v6.4s, v16.4h, v10.4h
 | |
| // v0 / v2 = v30 / v28 scaled by v10. v14 = v30 + v2, v26 = v28 - v0.
 | |
|     NEG             v0.4s, v0.4s
 | |
|     ADD             v14.4s, v30.4s , v2.4s
 | |
|     ADD             v26.4s, v28.4s , v0.4s
 | |
| // rev64 plus the D-lane swap below reverses all four 32-bit lanes of v14.
 | |
|     rev64           v14.4s, v14.4s
 | |
|     ushR            v4.4s, v4.4s, #16
 | |
| 
 | |
|     swp             v14.D[0], v14.D[1]
 | |
|     ushR            v6.4s, v6.4s, #16
 | |
| 
 | |
|     sMLAL           v4.4s, v19.4h, v10.4h
 | |
| 
 | |
|     sMLAL           v6.4s, v17.4h, v10.4h
 | |
| 
 | |
| 
 | |
| 
 | |
| // v24 = v20 + v4 with its four lanes reversed; v16 = v22 - v6.
 | |
|     ADD             v24.4s, v20.4s , v4.4s
 | |
| 
 | |
|     rev64           v24.4s, v24.4s
 | |
|     NEG             v16.4s, v6.4s
 | |
| 
 | |
| 
 | |
| 
 | |
|     swp             v24.D[0], v24.D[1]
 | |
|     ADD             v16.4s, v22.4s , v16.4s
 | |
| 
 | |
|     MOV             v25.16B, v24.16B
 | |
|     MOV             v15.16B, v14.16B
 | |
|     ST2             { v15.4s, v16.4s}, [x0], #32
 | |
|     ST2             { v25.4s, v26.4s}, [x7], x8
 | |
| 
 | |
| 
 | |
| 
 | |
| // Tail group: a full 8 words remain on the x5 side but only 6 are read on the x1 side; 7 words are stored at each end.
 | |
|     LD4             {v0.4h, v1.4h, v2.4h, v3.4h}, [x5], x8
 | |
| 
 | |
|     movi            v6.2s, #0x00000000    // zero v6/v7 so the lanes not loaded below read as 0
 | |
|     movi            v7.2s, #0x00000000
 | |
| 
 | |
|     LD2             {v4.2s, v5.2s}, [x1], #16    // v4 = words 0,2; v5 = words 1,3
 | |
|     LD2             {v6.s, v7.s}[0], [x1]    // v6[0] = word 4; v7[0] = word 5
 | |
| 
 | |
|     LD2             {v8.h, v9.h}[0], [x2], x6
 | |
|     LD2             {v8.h, v9.h}[1], [x2], x6
 | |
|     LD2             {v8.h, v9.h}[2], [x2], x6
 | |
|     LD2             {v8.h, v9.h}[3], [x2], x6
 | |
| 
 | |
|     rev64           v12.8h, v8.8h
 | |
|     rev64           v13.8h, v9.8h
 | |
|     swp             v5.D[0], v6.D[0]
 | |
| 
 | |
| // After the swap: v4 = words 0,2; v5 = word 4,0; v6 = words 1,3; v7 = word 5,0. The UZPs rebuild the halfword layout LD4 produces, with lane 3 zero.
 | |
|     MOV             v30.8B, V4.8B
 | |
|     UZP1            v4.4h, v30.4h, v5.4h
 | |
|     UZP2            v5.4h, v30.4h, v5.4h
 | |
|     MOV             v30.8B, V6.8B
 | |
|     UZP1            v6.4h, v30.4h, v7.4h
 | |
|     UZP2            v7.4h, v30.4h, v7.4h
 | |
|     uMULL           v30.4s, v2.4h, v13.4h
 | |
|     uMULL           v28.4s, v0.4h, v13.4h
 | |
| 
 | |
|     uMULL           v26.4s, v2.4h, v12.4h
 | |
|     uMULL           v24.4s, v0.4h, v12.4h
 | |
| 
 | |
|     ushR            v30.4s, v30.4s, #16
 | |
|     ushR            v28.4s, v28.4s, #16
 | |
|     ushR            v26.4s, v26.4s, #16
 | |
|     ushR            v24.4s, v24.4s, #16
 | |
| 
 | |
|     sMLAL           v30.4s, v3.4h, v13.4h
 | |
|     sMLAL           v28.4s, v1.4h, v13.4h
 | |
|     sMLAL           v26.4s, v3.4h, v12.4h
 | |
|     sMLAL           v24.4s, v1.4h, v12.4h
 | |
| 
 | |
|     uMULL           v22.4s, v6.4h, v9.4h
 | |
|     uMULL           v20.4s, v4.4h, v9.4h
 | |
| 
 | |
| // Same arithmetic as the groups above: v28 = -(v28 + v26), v30 = v30 - v24.
 | |
|     ADD             v28.4s, v28.4s , v26.4s
 | |
|     SUB             v30.4s, v30.4s , v24.4s
 | |
|     NEG             v28.4s, v28.4s
 | |
| 
 | |
|     uMULL           v18.4s, v6.4h, v8.4h
 | |
|     uMULL           v16.4s, v4.4h, v8.4h
 | |
| 
 | |
|     mov             v26.8b, v30.8b
 | |
|     mov             v27.D[0], v30.D[1]
 | |
|     ushR            v22.4s, v22.4s, #16
 | |
| 
 | |
|     mov             v24.16b, v28.16b
 | |
|     mov             v25.D[0], v28.D[1]
 | |
|     ushR            v20.4s, v20.4s, #16
 | |
| 
 | |
| 
 | |
|     MOV             v31.8B, V26.8B
 | |
|     UZP1            v26.4h, v31.4h, v27.4h
 | |
|     UZP2            v27.4h, v31.4h, v27.4h
 | |
|     ushr            v18.4s, v18.4s, #16
 | |
| 
 | |
|     MOV             v31.8B, V24.8B
 | |
|     UZP1            v24.4h, v31.4h, v25.4h
 | |
|     UZP2            v25.4h, v31.4h, v25.4h
 | |
|     ushR            v16.4s, v16.4s, #16
 | |
| 
 | |
|     sMLAL           v22.4s, v7.4h, v9.4h
 | |
|     sMLAL           v20.4s, v5.4h, v9.4h
 | |
|     sMLAL           v18.4s, v7.4h, v8.4h
 | |
|     sMLAL           v16.4s, v5.4h, v8.4h
 | |
| 
 | |
| 
 | |
|     uMULL           v0.4s, v26.4h, v10.4h
 | |
| 
 | |
| 
 | |
|     uMULL           v2.4s, v24.4h, v10.4h
 | |
| // v22 = -(v22 + v16), v20 = v18 - v20.
 | |
|     ADD             v22.4s, v22.4s , v16.4s
 | |
| 
 | |
| 
 | |
|     SUB             v20.4s, v18.4s , v20.4s
 | |
| 
 | |
| 
 | |
|     NEG             v22.4s, v22.4s
 | |
| 
 | |
| 
 | |
|     mov             v18.8B, v22.8B
 | |
|     mov             v19.D[0], v22.D[1]
 | |
|     ushR            v0.4s, v0.4s, #16
 | |
| 
 | |
|     mov             v16.16b, v20.16b
 | |
|     mov             v17.D[0], v20.D[1]
 | |
|     ushR            v2.4s, v2.4s, #16
 | |
| 
 | |
| 
 | |
|     MOV             v31.8B, V18.8B
 | |
|     UZP1            v18.4h, v31.4h, v19.4h
 | |
|     UZP2            v19.4h, v31.4h, v19.4h
 | |
|     sMLAL           v0.4s, v27.4h, v10.4h
 | |
| 
 | |
| 
 | |
|     MOV             v31.8B, V16.8B
 | |
|     UZP1            v16.4h, v31.4h, v17.4h
 | |
|     UZP2            v17.4h, v31.4h, v17.4h
 | |
|     sMLAL           v2.4s, v25.4h, v10.4h
 | |
| 
 | |
|     uMULL           v4.4s, v18.4h, v10.4h
 | |
|     uMULL           v6.4s, v16.4h, v10.4h
 | |
| // v14 = v30 + v2, v26 = v28 - v0 (v0 / v2 being v30 / v28 scaled by v10).
 | |
|     NEG             v0.4s, v0.4s
 | |
|     ADD             v14.4s, v30.4s , v2.4s
 | |
|     ADD             v26.4s, v28.4s , v0.4s
 | |
| 
 | |
|     rev64           v14.4s, v14.4s
 | |
|     ushR            v4.4s, v4.4s, #16
 | |
| 
 | |
|     swp             v14.D[0], v14.D[1]
 | |
|     ushR            v6.4s, v6.4s, #16
 | |
| 
 | |
|     sMLAL           v4.4s, v19.4h, v10.4h
 | |
| 
 | |
|     sMLAL           v6.4s, v17.4h, v10.4h
 | |
| 
 | |
| 
 | |
| 
 | |
| // v24 = v20 + v4 with its four lanes reversed; v16 = v22 - v6.
 | |
|     ADD             v24.4s, v20.4s , v4.4s
 | |
| 
 | |
|     rev64           v24.4s, v24.4s
 | |
|     NEG             v16.4s, v6.4s
 | |
| 
 | |
|     swp             v24.D[0], v24.D[1]
 | |
|     ADD             v16.4s, v22.4s , v16.4s
 | |
| 
 | |
| // Front tail store, 7 words: v15[0], v16[0], v15[1], v16[1], v15[2], v16[2], v15[3]. v16[3] comes from the zero-padded lane and is not written.
 | |
|     MOV             v15.16B, v14.16B
 | |
|     ST2             {v15.2s, v16.2s}, [x0], #16
 | |
| 
 | |
|     ST2             {v15.s, v16.s}[2], [x0], #8
 | |
| 
 | |
|     ST1             {v15.s}[3], [x0]
 | |
| // Back tail store, 7 words starting one word above x7: v26[0], v25[1], v26[1], v25[2], v26[2], v25[3], v26[3]. v25[0] is not written.
 | |
|     ADD             x7, x7, #4
 | |
| 
 | |
|     ST1             {v26.s}[0], [x7], #4
 | |
|     MOV             v25.16B, v24.16B
 | |
|     ST2             {v25.s, v26.s}[1], [x7], #8
 | |
|     MOV             v27.D[0], V26.d[1]    // v27 low half = v26[2], v26[3]
 | |
|     mov             v26.d[0], v25.d[1]    // v26 low half = v25[2], v25[3]
 | |
|     ST2             {v26.2s, v27.2s}, [x7]
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| // Restore the registers saved by push_v_regs at entry and return.
 | |
|     pop_v_regs
 | |
|     ret
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 | |
| 
 |