527 lines
14 KiB
ArmAsm
527 lines
14 KiB
ArmAsm
@/******************************************************************************
|
|
@ *
|
|
@ * Copyright (C) 2015 The Android Open Source Project
|
|
@ *
|
|
@ * Licensed under the Apache License, Version 2.0 (the "License");
|
|
@ * you may not use this file except in compliance with the License.
|
|
@ * You may obtain a copy of the License at:
|
|
@ *
|
|
@ * http://www.apache.org/licenses/LICENSE-2.0
|
|
@ *
|
|
@ * Unless required by applicable law or agreed to in writing, software
|
|
@ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
@ * See the License for the specific language governing permissions and
|
|
@ * limitations under the License.
|
|
@ *
|
|
@ *****************************************************************************
|
|
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
@*/
|
|
|
|
|
|
@ Dispatch table used by the final prediction step at `pred`.
@ Entry i holds the address of the code that places the 4x4 prediction for
@ intra mode i (the mode number kept in r12) into q15.
.data
.p2align 2

scratch_intrapred_luma_4x4_prediction:
    .word         ver                   @ mode 0
    .word         hor                   @ mode 1
    .word         d_c                   @ mode 2
    .word         dia_dl                @ mode 3
    .word         dia_dr                @ mode 4
    .word         ver_r                 @ mode 5
    .word         hor_d                 @ mode 6
    .word         ver_l                 @ mode 7
    .word         hor_u                 @ mode 8
|
|
|
|
|
|
@ Offset of the dispatch table relative to the PC value seen by the
@ "add r3, r3, pc" instruction at label scrintra_4x4.  In ARM state the PC
@ reads as the address of the executing instruction plus 8, hence the -8.
.text
.p2align 2

scratch_intrapred_luma_4x4_prediction_addr1:
    .word         scratch_intrapred_luma_4x4_prediction - scrintra_4x4 - 8
|
|
|
|
|
|
|
|
@/**
|
|
@******************************************************************************
|
|
@*
|
|
@* @brief :Evaluate best intra 4x4 mode
|
|
@* and do the prediction.
|
|
@*
|
|
@* @par Description
|
|
@* This function evaluates the enabled 4x4 intra modes, computes the cost (SAD plus mode cost) of each,
|
|
@* and writes the prediction of the best mode to the destination buffer.
|
|
@*
|
|
@* @param[in] pu1_src
|
|
@* UWORD8 pointer to the source
|
|
@*
|
|
@* @param[in] pu1_ngbr_pels
|
|
@* UWORD8 pointer to neighbouring pels
|
|
@*
|
|
@* @param[out] pu1_dst
|
|
@* UWORD8 pointer to the destination
|
|
@*
|
|
@* @param[in] src_strd
|
|
@* integer source stride
|
|
@*
|
|
@* @param[in] dst_strd
|
|
@* integer destination stride
|
|
@*
|
|
@* @param[in] u4_n_avblty
|
|
@* availability of neighbouring pixels
|
|
@*
|
|
@* @param[out] u4_intra_mode
|
|
@* Pointer to the variable in which best mode is returned
|
|
@*
|
|
@* @param[out] pu4_sadmin
|
|
@* Pointer to the variable in which minimum cost is returned
|
|
@*
|
|
@* @param[in] u4_valid_intra_modes
|
|
@* Says what all modes are valid
|
|
@*
|
|
@* @param[in] u4_lambda
|
|
@* Lambda value for computing cost from SAD
|
|
@*
|
|
@* @param[in] u4_predictd_mode
|
|
@* Predicted mode for cost computation
|
|
@*
|
|
@*
|
|
@*
|
|
@* @return none
|
|
@*
|
|
@******************************************************************************
|
|
@*/
|
|
@void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
|
|
@ UWORD8 *pu1_ngbr_pels,
|
|
@ UWORD8 *pu1_dst,
|
|
@ UWORD32 src_strd,
|
|
@ UWORD32 dst_strd,
|
|
@ WORD32 u4_n_avblty,
|
|
@ UWORD32 *u4_intra_mode,
|
|
@ WORD32 *pu4_sadmin,
|
|
@ UWORD32 u4_valid_intra_modes,
|
|
@ UWORD32 u4_lambda,
|
|
@ UWORD32 u4_predictd_mode)
|
|
|
|
|
|
|
|
.global ih264e_evaluate_intra_4x4_modes_a9q

@ Entry point: saves registers, loads the neighbour pixels and the 4x4 source
@ block, and seeds the running minimum before the per-mode cost blocks run.
@
@ Arguments as read by this code:
@   r0          = pu1_src
@   r1          = pu1_ngbr_pels
@   r2          = pu1_dst
@   r3          = src_strd
@   caller stack, in order: dst_strd, u4_n_avblty, u4_intra_mode,
@                           pu4_sadmin, u4_valid_intra_modes, u4_lambda,
@                           u4_predictd_mode
@
@ Register roles once this block has finished:
@   r0          = u4_lambda            (pu1_src is no longer needed)
@   r1          = u4_predictd_mode     (pu1_ngbr_pels is no longer needed)
@   r2          = pu1_dst
@   r5          = u4_n_avblty
@   r8          = u4_valid_intra_modes (bit i enables mode i)
@   r11         = lowest cost found so far, seeded with 1 << 30
@   r12         = mode that produced r11, seeded with 0
@   q0          = 16 bytes of neighbour data (see notes below)
@   q1          = q0 rotated down by one byte
@   d20, d21    = the four 4-byte source rows (rows 0-1, rows 2-3)
@
@ Stack layout: the stmfd pushes 10 words (40 bytes) and the vpush another
@ 64 bytes, so the first stacked argument sits at [sp, #40] before the vpush
@ and at [sp, #104] after it.
ih264e_evaluate_intra_4x4_modes_a9q:

    stmfd         sp!, {r4-r12, r14}    @ save callee-saved core registers and lr

    ldr           r5, [sp, #44]         @ r5 = u4_n_avblty (second stacked argument)

    vpush         {d8-d15}              @ save callee-saved NEON registers

    @ Neighbour pixels.
    @ NOTE(review): this reads 16 bytes from pu1_ngbr_pels; byte lanes 13 and
    @ 15 are overwritten by the two lane loads below - confirm the buffer is
    @ large enough for the 16-byte read.
    vld1.32       {q0}, [r1]
    add           r4, r1, #12
    vld1.8        d1[5], [r4]           @ q0 byte 13 = neighbour byte 12
    vld1.8        d1[7], [r1]           @ q0 byte 15 = neighbour byte 0

    ldr           r8, [sp, #120]        @ r8 = u4_valid_intra_modes

    @ Source block: four rows of 4 bytes, interleaved with scalar set-up.
    vld1.32       {d20[0]}, [r0], r3
    vext.8        q1, q0, q0, #1        @ q1 = neighbours shifted by one byte
    vld1.32       {d20[1]}, [r0], r3
    mov           r11, #1
    vld1.32       {d21[0]}, [r0], r3
    lsl           r11, r11, #30         @ r11 = 1 << 30, initial "minimum" cost
    vld1.32       {d21[1]}, [r0], r3

    @ Seed the best mode.  Without this r12 holds whatever the caller left in
    @ it when no mode is enabled (or none beats the seed cost), and `pred`
    @ would store that value and use it as the jump-table index.
    mov           r12, #0

    ldr           r0, [sp, #124]        @ r0 = u4_lambda
    ldr           r1, [sp, #128]        @ r1 = u4_predictd_mode
|
|
@------
|
|
|
|
|
|
@ Mode 0 (VERT): cost of predicting every row from q1 bytes 4..7
@ (neighbour bytes 5..8).  Skipped when bit 0 of r8 is clear.
vert:
    tst           r8, #1                @ VERT enabled?
    beq           horz

    vdup.32       q2, d2[1]             @ all four rows = q1 bytes 4..7
    vabdl.u8      q14, d4, d20          @ |pred - src|, rows 0-1, widened to u16
    vabal.u8      q14, d4, d21          @ accumulate rows 2-3
    vadd.i16      d28, d29, d28         @ fold 8 lanes into 4
    cmp           r1, #0                @ is VERT the predicted mode?
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    moveq         r6, r0                @ yes: mode cost = lambda
    vmov.u32      r9, d28[0]
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #0
|
|
|
|
@ Mode 1 (HORZ): cost of predicting each row from one of the first four
@ neighbour bytes n0..n3.  The transposes leave q3 as the rows
@ {n3 x4, n2 x4, n1 x4, n0 x4}; q3 is reused by `hor` if this mode wins.
@ Skipped when bit 1 of r8 is clear.  Clobbers q4.
horz:
    tst           r8, #2                @ HORZ enabled?
    beq           dc

    vdup.32       q3, d0[0]             @ n0..n3 replicated in every word
    vmov.32       q4, q3
    vtrn.8        q3, q4
    vtrn.16       d7, d6
    vtrn.16       d9, d8
    vtrn.32       d9, d7
    vtrn.32       d8, d6                @ d6 = rows 0-1, d7 = rows 2-3
    vabdl.u8      q14, d6, d20          @ |pred - src|, rows 0-1
    cmp           r1, #1                @ is HORZ the predicted mode?
    vabal.u8      q14, d7, d21          @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]
    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #1
|
|
|
|
@ Mode 2 (DC): derives the DC value from neighbour bytes 0..3 and 5..8 and
@ the availability bits in r5, then computes its cost.  r4 keeps the DC
@ value for the `d_c` prediction path.  Skipped when bit 2 of r8 is clear.
@
@ NOTE(review): the sum always covers both groups of four bytes, even when
@ only one of availability bits 0 / 2 is set - presumably the caller supplies
@ zeros for unavailable neighbours; confirm against the caller.
dc:
    tst           r8, #4                @ DC enabled?
    beq           diags

    vext.8        q4, q0, q0, #5        @ d8 = neighbour bytes 5..12
    vaddl.u8      q4, d0, d8            @ d8 lanes = n[i] + n[i + 5], i = 0..3
    vpaddl.u16    d8, d8
    vpaddl.u32    d8, d8
    vmov.u32      r4, d8[0]             @ r4 = sum of the eight bytes

    mov           r14, #1               @ r14 = shift amount, r4 gathers rounding
    tst           r5, #1                @ availability bit 0
    addne         r4, r4, #2
    addne         r14, r14, #1
    tst           r5, #4                @ availability bit 2
    addne         r4, r4, #2
    addne         r14, r14, #1
    tst           r5, #5                @ neither bit set: fixed value 128
    moveq         r4, #128
    moveq         r14, #0

    cmp           r1, #2                @ is DC the predicted mode?
    lsr           r4, r4, r14           @ r4 = DC value
    vdup.8        q4, r4
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vabdl.u8      q14, d8, d20          @ |pred - src|, rows 0-1
    vabal.u8      q14, d9, d21          @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]

    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #2
|
|
|
|
@ Shared set-up for modes 3..8.  Jumps straight to `pred` when none of the
@ mode bits 3..8 (mask 504 = 0x1F8) is set in r8.
@
@ With n[] the neighbour bytes in q0 (q1 = n shifted by one, q5 = n shifted
@ by two, all with wrap-around from vext on the same register):
@   q5  <- (n[i] + n[i+1] + 1) >> 1                  ("FILT11")
@   q6  <- (n[i] + 2*n[i+1] + n[i+2] + 2) >> 2       ("FILT121")
@   q13 <- mask with only the lowest 32 bits set (d26 selects the low word
@          of a d register in the vbit merges of the following blocks)
@ Clobbers q7, q8, q9, q11, q12 and r14.
@
@ The original block also executed "vadd.u16 q12, q10, q11" before the
@ narrowing shifts.  q11 had not been written at that point and q12 is fully
@ overwritten below before it is read, so the instruction was dead and has
@ been dropped.
diags:
    tst           r8, #504              @ any mode other than VERT, HORZ, DC?
    beq           pred

    vext.8        q5, q0, q0, #2        @ q5 = n[i + 2]
    vaddl.u8      q6, d0, d2            @ q6:q7 = n[i] + n[i + 1]
    vaddl.u8      q7, d1, d3
    vaddl.u8      q8, d10, d2           @ q8:q9 = n[i + 2] + n[i + 1]
    vaddl.u8      q9, d11, d3
    vqrshrun.s16  d10, q6, #1           @ q5 = FILT11
    vqrshrun.s16  d11, q7, #1
    vadd.u16      q11, q6, q8           @ n[i] + 2*n[i + 1] + n[i + 2]
    vadd.u16      q12, q7, q9
    vqrshrun.s16  d12, q11, #2          @ q6 = FILT121
    vqrshrun.s16  d13, q12, #2

    mov           r14, #0
    vdup.32       q13, r14              @ q13 = 0
    mov           r14, #-1
    vmov.i32      d26[0], r14           @ q13 word 0 = 0xffffffff
|
|
|
|
@ Mode 3 (DIAG_DL): assembles the prediction in q7 (d14 = rows 0-1,
@ d15 = rows 2-3) from byte-shifted copies of the FILT121 vector q6, then
@ computes its cost.  d26 selects the low word of a d register; d28, after
@ the vext.32 of q13, selects the high word.  Skipped when bit 3 of r8 is
@ clear.  q7 is reused by `dia_dl` if this mode wins.
diag_dl:
    tst           r8, #0x08             @ DIAG_DL enabled?
    beq           diag_dr

    vext.8        q15, q6, q6, #5
    vbit.32       d14, d30, d26         @ row 0
    vext.8        q15, q6, q6, #15
    vbit.32       d15, d31, d26         @ row 2
    vext.8        q15, q6, q6, #2
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d14, d30, d28         @ row 1
    vext.8        q15, q6, q6, #4
    vbit.32       d15, d30, d28         @ row 3
    vabdl.u8      q14, d14, d20         @ |pred - src|, rows 0-1
    cmp           r1, #3                @ is DIAG_DL the predicted mode?
    vabal.u8      q14, d15, d21         @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]

    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #3
|
|
|
|
@ Mode 4 (DIAG_DR): assembles the prediction in q8 (d16 = rows 0-1,
@ d17 = rows 2-3) from byte-shifted copies of the FILT121 vector q6, then
@ computes its cost.  d26 selects the low word of a d register; d28, after
@ the vext.32 of q13, selects the high word.  Skipped when bit 4 of r8 is
@ clear.  q8 is reused by `dia_dr` if this mode wins.
diag_dr:
    tst           r8, #16               @ DIAG_DR enabled?
    beq           vert_r

    vext.8        q15, q6, q6, #3
    vbit.32       d16, d30, d26         @ row 0
    vext.8        q15, q6, q6, #1
    vbit.32       d17, d30, d26         @ row 2
    vext.8        q15, q6, q6, #4
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d17, d31, d28         @ row 3
    vext.8        q15, q6, q6, #6
    vbit.32       d16, d31, d28         @ row 1
    vabdl.u8      q14, d16, d20         @ |pred - src|, rows 0-1
    cmp           r1, #4                @ is DIAG_DR the predicted mode?
    vabal.u8      q14, d17, d21         @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]

    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #4
|
|
|
|
@ Mode 5 (VERT_R): assembles the prediction in q9 (d18 = rows 0-1,
@ d19 = rows 2-3) from the FILT11 vector q5 and the FILT121 vector q6, then
@ patches byte 0 and byte 4 of d19 individually with a one-byte mask.
@ Skipped when bit 5 of r8 is clear.  q9 is reused by `ver_r` if this mode
@ wins.  r14 is used as scratch; the return address was saved on entry.
vert_r:
    tst           r8, #32               @ VERT_R enabled?
    beq           horz_d

    vext.8        q15, q5, q5, #4
    vbit.32       d18, d30, d26         @ row 0
    vext.8        q15, q5, q5, #3
    vbit.32       d19, d30, d26         @ row 2
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vext.8        q15, q6, q6, #15
    vbit.32       d18, d30, d28         @ row 1
    vext.8        q15, q6, q6, #14
    vbit.32       d19, d30, d28         @ row 3

    mov           r14, #0
    vdup.32       q14, r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14           @ d28 = mask for byte 0 only
    vext.8        q15, q6, q6, #2
    vbit.32       d19, d30, d28         @ patch d19 byte 0
    vext.32       q14, q14, q14, #3     @ d28 = mask for byte 4 only
    cmp           r1, #5                @ is VERT_R the predicted mode?
    vext.8        q15, q6, q6, #13
    vbit.32       d19, d30, d28         @ patch d19 byte 4
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda

    vabdl.u8      q14, d18, d20         @ |pred - src|, rows 0-1
    vabal.u8      q14, d19, d21         @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]

    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #5
|
|
|
|
@ Mode 6 (HORZ_D).  The first three instructions run unconditionally: they
@ leave in q1 the low halves of FILT11 (q5) and FILT121 (q6) interleaved
@ byte by byte, which `horz_u` also consumes.  The prediction is assembled
@ in q4 (d8 = rows 0-1, d9 = rows 2-3) and is reused by `hor_d` if this mode
@ wins.  The cost part is skipped when bit 6 of r8 is clear.
horz_d:
    vmov.8        q1, q5
    vmov.8        q15, q6
    vzip.8        q1, q15               @ q1 = {q5[0], q6[0], q5[1], q6[1], ...}

    tst           r8, #64               @ HORZ_D enabled?
    beq           vert_l

    vext.8        q15, q6, q6, #2
    vbit.32       d8, d30, d26          @ row 0
    mov           r14, #0
    vdup.32       q14, r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14           @ d28 = mask for byte 0 only
    vext.8        q15, q5, q5, #3
    vbit.32       d8, d30, d28          @ patch d8 byte 0
    vext.8        q15, q1, q1, #2
    vbit.32       d9, d30, d26          @ row 2
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d8, d2, d28           @ row 1
    cmp           r1, #6                @ is HORZ_D the predicted mode?
    vext.8        q15, q1, q1, #12
    vbit.32       d9, d30, d28          @ row 3
    vabdl.u8      q14, d8, d20          @ |pred - src|, rows 0-1
    vabal.u8      q14, d9, d21          @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]

    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #6
|
|
@ Mode 7 (VERT_L): assembles the prediction in q12 (d24 = rows 0-1,
@ d25 = rows 2-3): the low words come from the FILT11 vector q5, the high
@ words from the FILT121 vector q6, each at two consecutive byte offsets.
@ Skipped when bit 7 of r8 is clear.  q12 is reused by `ver_l` if this mode
@ wins.
vert_l:
    tst           r8, #128              @ VERT_L enabled?
    beq           horz_u

    vext.8        q15, q5, q5, #5
    vbit.32       d24, d30, d26         @ row 0
    vext.8        q15, q15, q15, #1     @ advance one more byte
    vbit.32       d25, d30, d26         @ row 2
    vext.8        q15, q6, q6, #1
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vbit.32       d24, d30, d28         @ row 1
    vext.8        q15, q15, q15, #1     @ advance one more byte
    cmp           r1, #7                @ is VERT_L the predicted mode?
    vbit.32       d25, d30, d28         @ row 3
    vabdl.u8      q14, d24, d20         @ |pred - src|, rows 0-1
    vabal.u8      q14, d25, d21         @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]

    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #7
|
|
|
|
@ Mode 8 (HORZ_U): starts from q1 filled with neighbour byte 0, then merges
@ in pieces of the byte-reversed interleaved filter vector (q1 as left by
@ `horz_d`, reversed into q5).  The prediction ends up in q1 (d2 = rows 0-1,
@ d3 = rows 2-3) and is reused by `hor_u` if this mode wins.  Skipped when
@ bit 8 of r8 is clear.  Overwrites q5 and q6.
horz_u:
    tst           r8, #256              @ HORZ_U enabled?
    beq           pred

    vrev64.8      q5, q1                @ reverse bytes within each d register
    vdup.8        q1, d0[0]             @ default: neighbour byte 0 everywhere
    vext.8        q6, q6, q6, #7
    mov           r14, #0
    vdup.32       q14, r14
    mov           r14, #0xff
    vmov.i8       d28[0], r14           @ d28 = mask for byte 0 only
    vbit.32       d11, d13, d28         @ patch d11 byte 0 from rotated FILT121
    movw          r14, #0xffff
    vmov.i16      d28[0], r14           @ d28 = mask for bytes 0-1
    vext.8        q6, q5, q5, #7
    cmp           r1, #8                @ is HORZ_U the predicted mode?
    vbit.32       d3, d12, d28          @ first two bytes of row 2
    vext.8        q6, q5, q5, #3
    vbit.32       d2, d12, d26          @ row 0
    vext.32       q14, q13, q13, #3     @ d28 = high-word mask
    vext.8        q6, q5, q5, #1
    vbit.32       d2, d12, d28          @ row 1
    vabdl.u8      q14, d2, d20          @ |pred - src|, rows 0-1
    vabal.u8      q14, d3, d21          @ accumulate rows 2-3
    vadd.i16      d28, d29, d28
    vpaddl.u16    d28, d28
    lslne         r6, r0, #2            @ no : mode cost = 4 * lambda
    vpaddl.u32    d28, d28              @ d28 low word = SAD of the block
    vmov.u32      r9, d28[0]

    moveq         r6, r0                @ yes: mode cost = lambda
    add           r9, r6, r9            @ r9 = SAD + mode cost

    cmp           r11, r9               @ strictly better than the best so far?
    movgt         r11, r9
    movgt         r12, #8
|
|
|
|
@ Writes the results back and dispatches to the code that places the winning
@ mode's prediction in q15.
@   *pu4_sadmin    <- r11 (lowest cost)
@   *u4_intra_mode <- r12 (mode with that cost)
@ The target is fetched from scratch_intrapred_luma_4x4_prediction[r12];
@ bit 0 of the address is cleared before the bx.
pred:
    ldr           r7, [sp, #116]        @ r7 = pu4_sadmin
    ldr           r6, [sp, #112]        @ r6 = u4_intra_mode

    str           r11, [r7]             @ store the minimum cost
    str           r12, [r6]             @ store the selected mode

    @ r3 = table address: stored offset plus the PC value read at
    @ scrintra_4x4.  The label must stay on the add instruction because the
    @ stored offset is computed relative to it.
    ldr           r3, scratch_intrapred_luma_4x4_prediction_addr1
scrintra_4x4:
    add           r3, r3, pc
    add           r3, r3, r12, lsl #2   @ r3 = &table[mode]

    ldr           r5, [r3]
    bic           r5, r5, #1            @ clear bit 0 of the target address

    bx            r5
|
|
|
|
|
|
@ Jump-table targets: each one copies (or rebuilds) the prediction of the
@ selected mode into q15 and continues at `store`.  The registers read here
@ were filled by the matching cost block earlier in the function.

ver:                                    @ mode 0: rebuilt from q0
    vext.8        q0, q0, q0, #1
    vdup.32       q15, d0[1]            @ every row = neighbour bytes 5..8
    b             store

hor:                                    @ mode 1: rows kept in q3
    vmov          q15, q3
    b             store

d_c:                                    @ mode 2: DC value kept in r4
    vdup.8        q15, r4
    b             store

dia_dl:                                 @ mode 3: rows kept in q7
    vmov          q15, q7
    b             store

dia_dr:                                 @ mode 4: rows kept in q8
    vmov          q15, q8
    b             store

ver_r:                                  @ mode 5: rows kept in q9
    vmov          q15, q9
    b             store

hor_d:                                  @ mode 6: rows kept in q4
    vmov          q15, q4
    b             store

ver_l:                                  @ mode 7: rows kept in q12
    vmov          q15, q12
    b             store

hor_u:                                  @ mode 8: rows kept in q1
    vmov          q15, q1               @ falls through into `store`
|
|
|
|
@ Writes the four 4-byte rows held in q15 to pu1_dst (r2), advancing by
@ dst_strd after each row, then restores the saved registers and returns.
store:
    ldr           r4, [sp, #104]        @ r4 = dst_strd (first stacked argument)

    vst1.32       {d30[0]}, [r2], r4    @ row 0
    vst1.32       {d30[1]}, [r2], r4    @ row 1
    vst1.32       {d31[0]}, [r2], r4    @ row 2
    vst1.32       {d31[1]}, [r2], r4    @ row 3

end_func:
    vpop          {d8-d15}              @ restore callee-saved NEON registers
    pop           {r4-r12, pc}          @ restore core registers and return
|
|
|
|
|
|
|
|
|
|
|