//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  impeg2_idct.s
// *
// * @brief
// *  contains function definitions for single stage inverse transform
// *
// * @author
// *  anand s
// *
// * @par list of functions:
// *  - impeg2_idct_recon_dc_av8()
// *
// * @remarks
// *  none
// *
// *******************************************************************************
//*/

///**
// *******************************************************************************
// *
// * @brief
// *  this function performs inverse transform and reconstruction for 8x8
// *  input block
// *
// * @par description:
// *  performs inverse transform and adds the prediction data and clips output
// *  to 8 bit
// *
// * @param[in] pi2_src
// *  input 8x8 coefficients
// *
// * @param[in] pi2_tmp
// *  temporary 8x8 buffer for storing inverse
// *  transform 1st stage output
// *
// * @param[in] pu1_pred
// *  prediction 8x8 block
// *
// * @param[out] pu1_dst
// *  output 8x8 block
// *
// * @param[in] src_strd
// *  input stride
// *
// * @param[in] pred_strd
// *  prediction stride
// *
// * @param[in] dst_strd
// *  output stride
// *
// * @param[in] shift
// *  output shift
// *
// * @param[in] zero_cols
// *  zero columns in pi2_src
// *
// * @param[in] zero_rows
// *  zero rows in pi2_src
// *
// * @returns void
// *
// * @remarks
// *  none
// *
// *******************************************************************************
// */

//void impeg2_itrans_recon_8x8(word16 *pi2_src,
//                             word16 *pi2_tmp,
//                             uword8 *pu1_pred,
//                             uword8 *pu1_dst,
//                             word32 src_strd,
//                             word32 pred_strd,
//                             word32 dst_strd,
//                             word32 zero_cols,
//                             word32 zero_rows)

//**************variables vs registers*************************
// x0 => *pi2_src
// x1 => *pi2_tmp
// x2 => *pu1_pred
// x3 => *pu1_dst
// src_strd
// pred_strd
// dst_strd
// zero_cols
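// (the remaining word32 arguments follow the AArch64 procedure call standard:
// src_strd, pred_strd, dst_strd and zero_cols arrive in w4-w7, and zero_rows
// is read from the stack - see "ldr w11, [sp]" in impeg2_idct_recon_av8 below)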

.text
.align 4
.include "impeg2_neon_macros.s"

.set idct_stg1_shift , 12
.set idct_stg2_shift , 16
.set idct_stg1_round , (1 << (idct_stg1_shift - 1))
.set idct_stg2_round , (1 << (idct_stg2_shift - 1))
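// idct_stg1_round = 1 << 11 and idct_stg2_round = 1 << 15, so each stage
// rounds to nearest before its right shift: a stage-1 value x becomes
// (x + 2048) >> 12 and a stage-2 value becomes (x + 32768) >> 16
// (the DC-only paths below do exactly this with mul / add #round / asr #shift)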

.extern gai2_impeg2_idct_q15
.extern gai2_impeg2_idct_q11
.extern gai2_impeg2_idct_first_col_q15
.extern gai2_impeg2_idct_first_col_q11
.extern gai2_impeg2_mismatch_stg2_additive

.global impeg2_idct_recon_dc_av8

impeg2_idct_recon_dc_av8:
    // STMFD sp!,{x4,x6,x12,x14}
    push_v_regs
    ////x0: pi2_src
    ////x1: pi2_tmp - not used, used as pred_strd
    ////x2: pu1_pred
    ////x3: pu1_dst
    ////x4: used as scratch
    ////x5: pred_strd
    ////x6: dst_strd
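
    //// in effect the block below computes (a sketch with illustrative names;
    //// only the first entry of each constant table is used):
    ////   dc = (pi2_src[0] * gai2_impeg2_idct_q15[0] + idct_stg1_round) >> idct_stg1_shift
    ////   dc = (dc         * gai2_impeg2_idct_q11[0] + idct_stg2_round) >> idct_stg2_shift
    ////   for each of the 8 rows: pu1_dst[col] = clip_u8(pu1_pred[col] + dc), col = 0..7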

    ldrsh x4, [x0]
    adrp x14, :got:gai2_impeg2_idct_q15
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh x12, [x14]

    ld1 {v0.8b}, [x2], x5
    mul x4, x4, x12

    ld1 {v1.8b}, [x2], x5
    add x4, x4, #idct_stg1_round

    ld1 {v2.8b}, [x2], x5
    asr x4, x4, #idct_stg1_shift

    adrp x14, :got:gai2_impeg2_idct_q11
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh x12, [x14]

    ld1 {v3.8b}, [x2], x5
    mul x4, x4, x12

    ld1 {v4.8b}, [x2], x5
    add x4, x4, #idct_stg2_round

    ld1 {v5.8b}, [x2], x5
    asr x4, x4, #idct_stg2_shift

    ld1 {v6.8b}, [x2], x5
    dup v30.8h, w4

    ld1 {v7.8b}, [x2], x5

    uaddw v8.8h, v30.8h, v0.8b

    uaddw v10.8h, v30.8h, v1.8b
    sqxtun v0.8b, v8.8h

    uaddw v12.8h, v30.8h, v2.8b
    sqxtun v1.8b, v10.8h
    st1 {v0.8b}, [x3], x6

    uaddw v14.8h, v30.8h, v3.8b
    sqxtun v2.8b, v12.8h
    st1 {v1.8b}, [x3], x6

    uaddw v16.8h, v30.8h, v4.8b
    sqxtun v3.8b, v14.8h
    st1 {v2.8b}, [x3], x6

    uaddw v18.8h, v30.8h, v5.8b
    sqxtun v4.8b, v16.8h
    st1 {v3.8b}, [x3], x6

    uaddw v20.8h, v30.8h, v6.8b
    sqxtun v5.8b, v18.8h
    st1 {v4.8b}, [x3], x6

    uaddw v22.8h, v30.8h, v7.8b
    sqxtun v6.8b, v20.8h
    st1 {v5.8b}, [x3], x6

    sqxtun v7.8b, v22.8h
    st1 {v6.8b}, [x3], x6

    st1 {v7.8b}, [x3], x6

    // LDMFD sp!,{x4,x6,x12,pc}
    pop_v_regs
    ret

.global impeg2_idct_recon_dc_mismatch_av8
.extern gai2_impeg2_idct_last_row_q11
.extern gai2_impeg2_mismatch_stg1_outp

impeg2_idct_recon_dc_mismatch_av8:
    // STMFD sp!,{x4-x12,x14}
    push_v_regs

    ldrsh x4, [x0]
    adrp x14, :got:gai2_impeg2_idct_q15
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    ldrsh x12, [x14]

    mul x4, x4, x12
    add x4, x4, #idct_stg1_round
    asr x4, x4, #idct_stg1_shift

    adrp x14, :got:gai2_impeg2_idct_q11
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    ldrsh x12, [x14]
    mul x4, x4, x12
    dup v0.4s, w4

    mov x14, #16 ////Increment for table read
    adrp x4, :got:gai2_impeg2_mismatch_stg2_additive
    ldr x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive]
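
    //// each of the eight unrolled blocks below handles one row: it loads
    //// eight 16-bit entries from the mismatch additive table, widens them to
    //// 32 bit, adds the scaled dc value with a rounding narrow (raddhn/raddhn2,
    //// i.e. (dc + additive + 0x8000) >> 16), adds the prediction row,
    //// saturates to 8 bit and stores the result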

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    ld1 {v2.4h, v3.4h}, [x4], x14
    ld1 {v30.8b}, [x2], x5
    sxtl v8.4s, v2.4h
    sxtl v10.4s, v3.4h
    raddhn v12.4h, v0.4s, v8.4s
    raddhn2 v12.8h, v0.4s, v10.4s
    uaddw v14.8h, v12.8h, v30.8b
    sqxtun v30.8b, v14.8h
    st1 {v30.8b}, [x3], x6

    // LDMFD sp!,{x4-x12,pc}
    pop_v_regs
    ret

.globl impeg2_idct_recon_av8

.type impeg2_idct_recon_av8, %function

impeg2_idct_recon_av8:
    ////register usage - loading and idct of columns
    //// cosine constants - d0
    //// sine constants - d1
    //// row 0 first half - d2 - y0
    //// row 1 first half - d6 - y1
    //// row 2 first half - d3 - y2
    //// row 3 first half - d7 - y3
    //// row 4 first half - d10 - y4
    //// row 5 first half - d14 - y5
    //// row 6 first half - d11 - y6
    //// row 7 first half - d15 - y7

    //// row 0 second half - d4 - y0
    //// row 1 second half - d8 - y1
    //// row 2 second half - d5 - y2
    //// row 3 second half - d9 - y3
    //// row 4 second half - d12 - y4
    //// row 5 second half - d16 - y5
    //// row 6 second half - d13 - y6
    //// row 7 second half - d17 - y7

    //// copy the input pointer to another register
    //// step 1 : load all constants
    // stmfd sp!,{x4-x12,x14}

    ldr w11, [sp] // zero rows

    push_v_regs
    stp x19, x20, [sp, #-16]!

    mov x12, x7 // zero columns
    mov x8, x5 // prediction stride
    mov x7, x6 // destination stride
    mov x6, x4 // src stride
    lsl x6, x6, #1 // x sizeof(word16)
    add x9, x0, x6, lsl #1 // 2 rows

    add x10, x6, x6, lsl #1 // 3 rows

    sub x10, x10, #8 // - 4 cols * sizeof(word16)
    sub x5, x6, #8 // src_strd - 4 cols * sizeof(word16)

    adrp x14, :got:gai2_impeg2_idct_first_col_q15
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1 {v0.4h, v1.4h}, [x14] ////d0,d1 are used for storing the constant data
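
    //// judging from the multiply comments below, the constant registers hold
    //// v0.h[0] = cos4, v0.h[1] = cos1, v0.h[2] = cos2, v0.h[3] = cos3 and
    //// v1.h[1] = sin3, v1.h[2] = sin2, v1.h[3] = sin1 (q15 values from
    //// gai2_impeg2_idct_first_col_q15); the even part forms c0/c1 from y0,y4
    //// and d0/d1 from y2,y6, the odd part forms b0..b3 from y1,y3,y5,y7, and
    //// each output is x(i) = (a(i) +/- b(i) + rnd) >> idct_stg1_shift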

    ////step 2 load all the input data
    ////step 3 operate first 4 columns at a time

    and x11, x11, #0xff
    and x12, x12, #0xff

    cmp x11, #0xf0
    bge skip_last4_rows
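
    //// zero_rows is held in x11: when its upper four bits are all set
    //// (apparently meaning rows 4-7 of pi2_src are zero) the branch above
    //// takes the reduced path at skip_last4_rows, which loads and transforms
    //// only the first four rows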

    ld1 {v2.4h}, [x0], #8
    ld1 {v3.4h}, [x9], #8
    ld1 {v4.4h}, [x0], x5
    smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    ld1 {v5.4h}, [x9], x5
    smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    ld1 {v6.4h}, [x0], #8
    ld1 {v7.4h}, [x9], #8
    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    ld1 {v8.4h}, [x0], x10
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    ld1 {v9.4h}, [x9], x10
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    ld1 {v10.4h}, [x0], #8
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
    ld1 {v11.4h}, [x9], #8
    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    ld1 {v12.4h}, [x0], x5
    smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    ld1 {v13.4h}, [x9], x5
    smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    ld1 {v14.4h}, [x0], #8
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    ld1 {v15.4h}, [x9], #8
    smull v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
    ld1 {v16.4h}, [x0], x10
    smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
    ld1 {v17.4h}, [x9], x10

    ///* the following was used when the input is not aligned */
    //// vld1.16 d2,[x0]!
    //// vld1.16 d3,[x2]!
    //// vld1.16 d4,[x0]!
    //// vld1.16 d5,[x2]!
    //// vld1.16 d6,[x0]!
    //// vld1.16 d7,[x2]!
    //// vld1.16 d8,[x0],x3
    //// vld1.16 d9,[x2],x3
    //// vld1.16 d10,[x0]!
    //// vld1.16 d11,[x2]!
    //// vld1.16 d12,[x0]!
    //// vld1.16 d13,[x2]!
    //// vld1.16 d14,[x0]!
    //// vld1.16 d15,[x2]!
    //// vld1.16 d16,[x0],x3
    //// vld1.16 d17,[x2],x3

    smlal v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v10.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    add v14.4s, v10.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)
    sub v10.4s, v10.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    sub v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add v20.4s, v14.4s, v24.4s //// a0 + b0(part of x0)
    sub v6.4s, v14.4s, v24.4s //// a0 - b0(part of x7)

    add v24.4s, v22.4s, v28.4s //// a2 + b2(part of x2)
    sub v22.4s, v22.4s, v28.4s //// a2 - b2(part of x5)

    add v28.4s, v18.4s, v26.4s //// a1 + b1(part of x1)
    sub v18.4s, v18.4s, v26.4s //// a1 - b1(part of x6)

    add v26.4s, v10.4s, v30.4s //// a3 + b3(part of x3)
    sub v30.4s, v10.4s, v30.4s //// a3 - b3(part of x4)

    sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

    b last4_cols

skip_last4_rows:
    adrp x14, :got:gai2_impeg2_idct_first_col_q15
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1 {v0.4h, v1.4h}, [x14]

    ld1 {v2.4h}, [x0], #8
    ld1 {v3.4h}, [x9], #8
    ld1 {v4.4h}, [x0], x5
    ld1 {v5.4h}, [x9], x5
    ld1 {v6.4h}, [x0], #8
    ld1 {v7.4h}, [x9], #8
    ld1 {v8.4h}, [x0], x10
    ld1 {v9.4h}, [x9], x10

    movi v12.4h, #0
    movi v13.4h, #0
    movi v16.4h, #0
    movi v17.4h, #0

    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)

    smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)

    add v14.4s, v20.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)
    sub v10.4s, v20.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    sub v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add v20.4s, v14.4s, v24.4s //// a0 + b0(part of x0)
    sub v6.4s, v14.4s, v24.4s //// a0 - b0(part of x7)

    add v24.4s, v22.4s, v28.4s //// a2 + b2(part of x2)
    sub v22.4s, v22.4s, v28.4s //// a2 - b2(part of x5)

    add v28.4s, v18.4s, v26.4s //// a1 + b1(part of x1)
    sub v18.4s, v18.4s, v26.4s //// a1 - b1(part of x6)

    add v26.4s, v10.4s, v30.4s //// a3 + b3(part of x3)
    sub v30.4s, v10.4s, v30.4s //// a3 - b3(part of x4)

    sqrshrn v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

last4_cols:
    adrp x14, :got:gai2_impeg2_idct_first_col_q15
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    ld1 {v0.4h, v1.4h}, [x14]

    cmp x12, #0xf0
    bge skip_last4cols
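
    //// same idea for zero_cols (x12): if its upper four bits are all set the
    //// last four columns of the input are taken as zero, and skip_last4cols
    //// runs the remaining work on the first four columns only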

    smull v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
    smull v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0)

    smull v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)

    smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v12.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)

    add v16.4s, v12.4s, v8.4s //// a0 = c0 + d0(part of e0,e7)
    sub v12.4s, v12.4s, v8.4s //// a3 = c0 - d0(part of e3,e4)
    sub v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of e2,e5)
    add v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of e1,e6)

    add v20.4s, v16.4s, v24.4s //// a0 + b0(part of e0)
    sub v8.4s, v16.4s, v24.4s //// a0 - b0(part of e7)

    add v24.4s, v22.4s, v28.4s //// a2 + b2(part of e2)
    sub v22.4s, v22.4s, v28.4s //// a2 - b2(part of e5)

    add v28.4s, v18.4s, v26.4s //// a1 + b1(part of e1)
    sub v18.4s, v18.4s, v26.4s //// a1 - b1(part of e6)

    add v26.4s, v12.4s, v30.4s //// a3 + b3(part of e3)
    sub v30.4s, v12.4s, v30.4s //// a3 - b3(part of e4)

    sqrshrn v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> idct_stg1_shift
    sqrshrn v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> idct_stg1_shift
    sqrshrn v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> idct_stg1_shift
    sqrshrn v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> idct_stg1_shift
    sqrshrn v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> idct_stg1_shift
    sqrshrn v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> idct_stg1_shift
    sqrshrn v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> idct_stg1_shift
    sqrshrn v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> idct_stg1_shift

    b end_skip_last4cols

skip_last4cols:
    adrp x14, :got:gai2_impeg2_idct_first_col_q11
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1 {v0.4h, v1.4h}, [x14]

    umov x15, v25.d[0]

    trn1 v25.4h, v2.4h, v6.4h
    trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1 v27.4h, v3.4h, v7.4h
    trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1 v6.2s, v29.2s, v31.2s
    trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued
    trn1 v2.2s, v25.2s, v27.2s
    trn2 v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first quadrant transposing continued

    trn1 v25.4h, v10.4h, v14.4h
    trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1 v27.4h, v11.4h, v15.4h
    trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1 v10.2s, v25.2s, v27.2s
    trn2 v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third quadrant transposing continued
    trn1 v14.2s, v29.2s, v31.2s
    trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued

    mov v25.d[0], x15

    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    // vmull.s16 q11,d4,d0[0] @// y4 * cos4(part of c0 and c1)

    smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)

    sub v22.4s, v20.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    add v4.4s, v20.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)

    add v2.4s, v4.4s, v24.4s

    sub v6.4s, v4.4s, v24.4s

    add v8.4s, v22.4s, v30.4s

    sub v24.4s, v22.4s, v30.4s

    sqrshrn v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn v6.4h, v24.4s, #idct_stg2_shift

    sub v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add v30.4s, v22.4s, v28.4s

    sub v24.4s, v22.4s, v28.4s

    add v28.4s, v18.4s, v26.4s

    sub v22.4s, v18.4s, v26.4s
    sqrshrn v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn v8.4h, v22.4s, #idct_stg2_shift

    umov x19, v25.d[0]
    umov x20, v25.d[1]

    trn1 v27.4h, v2.4h, v3.4h
    trn2 v29.4h, v2.4h, v3.4h
    trn1 v25.4h, v4.4h, v5.4h
    trn2 v31.4h, v4.4h, v5.4h

    trn1 v2.2s, v27.2s, v25.2s
    trn2 v4.2s, v27.2s, v25.2s
    trn1 v3.2s, v29.2s, v31.2s
    trn2 v5.2s, v29.2s, v31.2s

    trn1 v27.4h, v6.4h, v7.4h
    trn2 v29.4h, v6.4h, v7.4h
    trn1 v25.4h, v8.4h, v9.4h
    trn2 v31.4h, v8.4h, v9.4h

    trn1 v6.2s, v27.2s, v25.2s
    trn2 v8.2s, v27.2s, v25.2s
    trn1 v7.2s, v29.2s, v31.2s
    trn2 v9.2s, v29.2s, v31.2s

    mov v25.d[0], x19
    mov v25.d[1], x20

    smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)

    smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)

    add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data

    add x5, x8, x8, lsl #1 //

    add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data

    add x10, x7, x7, lsl #1 //

    // swapping v3 and v6
    mov v31.d[0], v3.d[0]
    mov v3.d[0], v6.d[0]
    mov v6.d[0], v31.d[0]

    // swapping v5 and v8
    mov v31.d[0], v5.d[0]
    mov v5.d[0], v8.d[0]
    mov v8.d[0], v31.d[0]

    sub v22.4s, v20.4s, v14.4s //// a3 = c0 - d0(part of x3,x4)
    add v12.4s, v20.4s, v14.4s //// a0 = c0 + d0(part of x0,x7)

    add v0.4s, v12.4s, v24.4s

    sub v24.4s, v12.4s, v24.4s

    add v12.4s, v22.4s, v30.4s

    sub v14.4s, v22.4s, v30.4s

    sqrshrn v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn v14.4h, v14.4s, #idct_stg2_shift

    sub v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add v0.4s, v22.4s, v28.4s

    sub v24.4s, v22.4s, v28.4s

    add v28.4s, v18.4s, v26.4s

    sub v26.4s, v18.4s, v26.4s
    ld1 {v18.8b}, [x2], x8

    sqrshrn v12.4h, v0.4s, #idct_stg2_shift
    ld1 {v20.8b}, [x2], x5

    sqrshrn v15.4h, v24.4s, #idct_stg2_shift
    ld1 {v19.8b}, [x2], x8

    sqrshrn v11.4h, v28.4s, #idct_stg2_shift
    ld1 {v22.8b}, [x4], x8

    sqrshrn v16.4h, v26.4s, #idct_stg2_shift
    ld1 {v21.8b}, [x2], x5

    b pred_buff_addition

end_skip_last4cols:
    adrp x14, :got:gai2_impeg2_idct_first_col_q11
    ldr x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    ld1 {v0.4h, v1.4h}, [x14]

    umov x19, v25.d[0]
    umov x20, v25.d[1]

    ///* now that the column idct is done, transpose so that the row idct can be done efficiently (step 5) */
    trn1 v27.4h, v2.4h, v6.4h
    trn2 v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing
    trn1 v25.4h, v3.4h, v7.4h
    trn2 v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing

    trn1 v2.2s, v27.2s, v25.2s
    trn2 v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first quadrant transposing continued
    trn1 v6.2s, v29.2s, v31.2s
    trn2 v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued

    trn1 v27.4h, v4.4h, v8.4h
    trn2 v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second quadrant transposing
    trn1 v25.4h, v5.4h, v9.4h
    trn2 v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second quadrant transposing

    trn1 v4.2s, v27.2s, v25.2s
    trn2 v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second quadrant transposing continued
    trn1 v8.2s, v29.2s, v31.2s
    trn2 v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second quadrant transposing continued

    trn1 v27.4h, v10.4h, v14.4h
    trn2 v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing
    trn1 v25.4h, v11.4h, v15.4h
    trn2 v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing

    trn1 v10.2s, v27.2s, v25.2s
    trn2 v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third quadrant transposing continued
    trn1 v14.2s, v29.2s, v31.2s
    trn2 v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued

    trn1 v27.4h, v12.4h, v16.4h
    trn2 v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth quadrant transposing
    trn1 v25.4h, v13.4h, v17.4h
    trn2 v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth quadrant transposing

    trn1 v12.2s, v27.2s, v25.2s
    trn2 v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth quadrant transposing continued
    trn1 v16.2s, v29.2s, v31.2s
    trn2 v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth quadrant transposing continued

    mov v25.d[0], x19
    mov v25.d[1], x20

    ////step 6 operate on first four rows and find their idct
    ////register usage - storing and idct of rows
    //// cosine constants - d0
    //// sine constants - d1
    //// element 0 first four - d2 - y0
    //// element 1 first four - d6 - y1
    //// element 2 first four - d3 - y2
    //// element 3 first four - d7 - y3
    //// element 4 first four - d4 - y4
    //// element 5 first four - d8 - y5
    //// element 6 first four - d5 - y6
    //// element 7 first four - d9 - y7
    //// element 0 second four - d10 - y0
    //// element 1 second four - d14 - y1
    //// element 2 second four - d11 - y2
    //// element 3 second four - d15 - y3
    //// element 4 second four - d12 - y4
    //// element 5 second four - d16 - y5
    //// element 6 second four - d13 - y6
    //// element 7 second four - d17 - y7

    //// map between first kernel code seq and current
    //// d2 -> d2
    //// d6 -> d6
    //// d3 -> d3
    //// d7 -> d7
    //// d10 -> d4
    //// d14 -> d8
    //// d11 -> d5
    //// d15 -> d9
    //// q3 -> q3
    //// q5 -> q2
    //// q7 -> q4

    smull v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)

    smlal v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)

    smull v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    smull v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)

    smull v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    smull v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)

    smlal v24.4s, v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    smlsl v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    smlal v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    smlal v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    smlsl v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    smlal v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v2.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    smlsl v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    smlal v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub v22.4s, v2.4s, v6.4s //// a3 = c0 - d0(part of x3,x4)
    add v4.4s, v2.4s, v6.4s //// a0 = c0 + d0(part of x0,x7)

    add v2.4s, v4.4s, v24.4s

    sub v6.4s, v4.4s, v24.4s

    add v8.4s, v22.4s, v30.4s

    sub v24.4s, v22.4s, v30.4s

    sqrshrn v5.4h, v8.4s, #idct_stg2_shift
    sqrshrn v2.4h, v2.4s, #idct_stg2_shift
    sqrshrn v9.4h, v6.4s, #idct_stg2_shift
    sqrshrn v6.4h, v24.4s, #idct_stg2_shift

    sub v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add v30.4s, v22.4s, v28.4s

    sub v24.4s, v22.4s, v28.4s

    add v28.4s, v18.4s, v26.4s

    sub v22.4s, v18.4s, v26.4s
    sqrshrn v4.4h, v30.4s, #idct_stg2_shift
    sqrshrn v7.4h, v24.4s, #idct_stg2_shift
    sqrshrn v3.4h, v28.4s, #idct_stg2_shift
    sqrshrn v8.4h, v22.4s, #idct_stg2_shift

    umov x19, v25.d[0]
    umov x20, v25.d[1]

    trn1 v27.4h, v2.4h, v3.4h
    trn2 v29.4h, v2.4h, v3.4h
    trn1 v25.4h, v4.4h, v5.4h
    trn2 v31.4h, v4.4h, v5.4h

    trn1 v2.2s, v27.2s, v25.2s
    trn2 v4.2s, v27.2s, v25.2s
    trn1 v3.2s, v29.2s, v31.2s
    trn2 v5.2s, v29.2s, v31.2s

    trn1 v27.4h, v6.4h, v7.4h
    trn2 v29.4h, v6.4h, v7.4h
    trn1 v25.4h, v8.4h, v9.4h
    trn2 v31.4h, v8.4h, v9.4h

    trn1 v6.2s, v27.2s, v25.2s
    trn2 v8.2s, v27.2s, v25.2s
    trn1 v7.2s, v29.2s, v31.2s
    trn2 v9.2s, v29.2s, v31.2s

    mov v25.d[0], x19
    mov v25.d[1], x20

    smull v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
    smull v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
    smull v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
    smull v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
    smlal v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    smlsl v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    smlsl v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    smlsl v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    smull v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    smull v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
    smull v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
    smull v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
    smlal v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)

    add x4, x2, x8, lsl #1 // x4 = x2 + pred_strd * 2 => x4 points to 3rd row of pred data
    smlsl v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)

    add x5, x8, x8, lsl #1 //
    smlal v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)

    add x0, x3, x7, lsl #1 // x0 points to 3rd row of dest data
    smlal v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)

    add x10, x7, x7, lsl #1 //
    smlsl v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)

    smlal v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)

    add v12.4s, v20.4s, v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    sub v20.4s, v20.4s, v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)

    smlal v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)

    // swapping v3 and v6
    mov v31.d[0], v3.d[0]
    mov v3.d[0], v6.d[0]
    mov v6.d[0], v31.d[0]

    smlsl v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    // swapping v5 and v8
    mov v31.d[0], v5.d[0]
    mov v5.d[0], v8.d[0]
    mov v8.d[0], v31.d[0]

    smlal v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    smlsl v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)

    sub v22.4s, v12.4s, v14.4s //// a3 = c0 - d0(part of x3,x4)
    add v12.4s, v12.4s, v14.4s //// a0 = c0 + d0(part of x0,x7)

    add v0.4s, v12.4s, v24.4s

    sub v24.4s, v12.4s, v24.4s

    add v12.4s, v22.4s, v30.4s

    sub v14.4s, v22.4s, v30.4s

    sqrshrn v10.4h, v0.4s, #idct_stg2_shift
    sqrshrn v17.4h, v24.4s, #idct_stg2_shift
    sqrshrn v13.4h, v12.4s, #idct_stg2_shift
    sqrshrn v14.4h, v14.4s, #idct_stg2_shift

    sub v22.4s, v20.4s, v18.4s //// a2 = c1 - d1(part of x2,x5)
    add v18.4s, v20.4s, v18.4s //// a1 = c1 + d1(part of x1,x6)

    add v0.4s, v22.4s, v28.4s

    sub v24.4s, v22.4s, v28.4s

    add v28.4s, v18.4s, v26.4s

    sub v26.4s, v18.4s, v26.4s
    ld1 {v18.8b}, [x2], x8

    sqrshrn v12.4h, v0.4s, #idct_stg2_shift
    ld1 {v20.8b}, [x2], x5

    sqrshrn v15.4h, v24.4s, #idct_stg2_shift
    ld1 {v19.8b}, [x2], x8

    sqrshrn v11.4h, v28.4s, #idct_stg2_shift
    ld1 {v22.8b}, [x4], x8

    sqrshrn v16.4h, v26.4s, #idct_stg2_shift
    ld1 {v21.8b}, [x2], x5

pred_buff_addition:

    umov x19, v25.d[0]
    umov x20, v25.d[1]

    trn1 v27.4h, v10.4h, v11.4h
    trn2 v29.4h, v10.4h, v11.4h
    trn1 v25.4h, v12.4h, v13.4h
    trn2 v31.4h, v12.4h, v13.4h

    trn1 v10.2s, v27.2s, v25.2s
    trn2 v12.2s, v27.2s, v25.2s
    trn1 v11.2s, v29.2s, v31.2s
    trn2 v13.2s, v29.2s, v31.2s

    trn1 v27.4h, v14.4h, v15.4h
    trn2 v29.4h, v14.4h, v15.4h
    trn1 v25.4h, v16.4h, v17.4h
    trn2 v31.4h, v16.4h, v17.4h

    trn1 v14.2s, v27.2s, v25.2s
    trn2 v16.2s, v27.2s, v25.2s
    trn1 v15.2s, v29.2s, v31.2s
    trn2 v17.2s, v29.2s, v31.2s

    mov v25.d[0], x19
    mov v25.d[1], x20

    // load prediction data
    ld1 {v24.8b}, [x4], x5
    ld1 {v23.8b}, [x4], x8
    ld1 {v25.8b}, [x4], x5

    // adding the prediction buffer to the recon values
    mov v2.d[1], v3.d[0]
    mov v4.d[1], v5.d[0]
    mov v6.d[1], v7.d[0]
    mov v8.d[1], v9.d[0]
    uaddw v2.8h, v2.8h, v18.8b
    uaddw v4.8h, v4.8h, v22.8b
    uaddw v6.8h, v6.8h, v20.8b
    uaddw v8.8h, v8.8h, v24.8b

    // swapping v11 and v14
    mov v31.d[0], v11.d[0]
    mov v11.d[0], v14.d[0]
    mov v14.d[0], v31.d[0]

    // swapping v13 and v16
    mov v31.d[0], v13.d[0]
    mov v13.d[0], v16.d[0]
    mov v16.d[0], v31.d[0]

    // row values stored in the q registers:
    //q1 :x0
    //q3: x1
    //q2: x2
    //q4: x3
    //q5: x4
    //q7: x5
    //q6: x6
    //q8: x7

    mov v10.d[1], v11.d[0]
    mov v12.d[1], v13.d[0]
    mov v14.d[1], v15.d[0]
    mov v16.d[1], v17.d[0]
    uaddw v10.8h, v10.8h, v19.8b
    sqxtun v2.8b, v2.8h
    uaddw v14.8h, v14.8h, v21.8b
    sqxtun v4.8b, v4.8h
    uaddw v12.8h, v12.8h, v23.8b
    sqxtun v6.8b, v6.8h
    uaddw v16.8h, v16.8h, v25.8b
    sqxtun v8.8b, v8.8h

    st1 {v2.8b}, [x3], x7
    sqxtun v10.8b, v10.8h
    st1 {v6.8b}, [x3], x10
    sqxtun v14.8b, v14.8h
    st1 {v4.8b}, [x0], x7
    sqxtun v12.8b, v12.8h
    st1 {v8.8b}, [x0], x10
    sqxtun v16.8b, v16.8h

    st1 {v10.8b}, [x3], x7
    st1 {v14.8b}, [x3], x10
    st1 {v12.8b}, [x0], x7
    st1 {v16.8b}, [x0], x10

    // ldmfd sp!,{x4-x12,pc}
    ldp x19, x20, [sp], #16
    pop_v_regs
    ret