@ android13/external/libmpeg2/common/arm/ideint_cac_a9.s
@
@ 216 lines
@ 5.2 KiB
@ ArmAsm

@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@******************************************************************************
@*
@* @brief
@* This file contains definitions of routines for the combing artifact check (CAC)
@*
@* @author
@* Ittiam
@*
@* @par List of Functions:
@* - ideint_cac_8x8_a9()
@*
@* @remarks
@* None
@*
@*******************************************************************************
@******************************************************************************
@*
@* @brief Calculates Combing Artifact
@*
@* @par Description
@* This function calculates the combing artifact check (CAC) for the given two fields
@*
@* @param[in] pu1_top
@* UWORD8 pointer to top field
@*
@* @param[in] pu1_bot
@* UWORD8 pointer to bottom field
@*
@* @param[in] top_strd
@* Top field stride
@*
@* @param[in] bot_strd
@* Bottom field stride
@*
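@* @returns
@* 1 (in r0) if the biased alt value is less than the adj value for either
@* 4-column half of the block, 0 otherwise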
@*
@* @remarks
@*
@******************************************************************************
@ Combing artifact check over an 8x8 block supplied as two 8x4 fields.
@
@ In:    r0 = pu1_top  (top field; 4 rows of 8 bytes are read)
@        r1 = pu1_bot  (bottom field; 4 rows of 8 bytes are read)
@        r2 = top_strd (byte step between top field rows)
@        r3 = bot_strd (byte step between bottom field rows)
@ Out:   r0 = 1 if the biased alt value is less than the adj value for the
@             left or the right 4-column half of the block, 0 otherwise
@ Clobb: r0, r1, flags, q0-q3, q8-q10, q12-q15
@        (all of these are caller-saved under the AAPCS, so nothing is
@        spilled and the routine does not touch the stack)
    .global ideint_cac_8x8_a9
ideint_cac_8x8_a9:
    @ Load first row of top
    vld1.u8     d28,    [r0],   r2
    @ Load first row of bottom
    vld1.u8     d29,    [r1],   r3
    @ Load second row of top
    vld1.u8     d30,    [r0],   r2
    @ Load second row of bottom
    vld1.u8     d31,    [r1],   r3

    @ Calculate row based adj and alt values
    @ Get row sums: each 8-pixel row is reduced to a left and a right
    @ 4-pixel sum, so q0 = {top0 L, top0 R, bot0 L, bot0 R} and
    @ q1 holds the same layout for the second row of each field
    vpaddl.u8   q0,     q14
    vpaddl.u8   q1,     q15
    vpaddl.u16  q0,     q0
    vpaddl.u16  q1,     q1

    @ Pack q0 and q1 into a single register. A 4-pixel sum is at most
    @ 4 * 255 = 1020, so it fits in the low 16 bits of its 32-bit lane
    @ and q1 can be moved into the high halves without loss.
    vshl.u32    q8,     q1,     #16
    vorr.u32    q8,     q0,     q8
    @ q8 now contains 8 sums: d16 = top field, d17 = bottom field

    @ Load third row of top
    vld1.u8     d24,    [r0],   r2
    @ Load third row of bottom
    vld1.u8     d25,    [r1],   r3
    @ Load fourth row of top
    vld1.u8     d26,    [r0],   r2
    @ Load fourth row of bottom
    vld1.u8     d27,    [r1],   r3

    @ Get row sums for the third and fourth rows (same layout as q0/q1)
    vpaddl.u8   q2,     q12
    vpaddl.u8   q3,     q13
    vpaddl.u16  q2,     q2
    vpaddl.u16  q3,     q3

    @ Pack q2 and q3 into a single register (sum does not exceed 16 bits)
    vshl.u32    q9,     q3,     #16
    vorr.u32    q9,     q2,     q9
    @ q9 now contains 8 sums: d18 = top field, d19 = bottom field

    @ Compute absolute diff between top and bottom row sums
    vabd.u16    d16,    d16,    d17
    vabd.u16    d17,    d18,    d19

    @ RSUM_CSUM_THRESH (q9 is free again: its sums were consumed above)
    vmov.u16    q9,     #20

    @ Eliminate values smaller than RSUM_CSUM_THRESH
    vcge.u16    q10,    q8,     q9
    vand.u16    q10,    q8,     q10
    @ q10 now contains 8 absolute diff of sums above the threshold

    @ Compute adj
    vadd.u16    d20,    d20,    d21
    @ d20 has four adj values for two sub-blocks (two per sub-block)

    @ Compute alt: difference between consecutive rows of the same field
    vabd.u32    q0,     q0,     q1
    vabd.u32    q2,     q2,     q3
    vadd.u32    q0,     q0,     q2
    vadd.u32    d21,    d0,     d1
    @ d21 has two values for two sub-blocks

    @ Calculate column based adj and alt values
    @ d0 = rounded per-column average of the top field rows,
    @ d1 = rounded per-column average of the bottom field rows
    vrhadd.u8   q0,     q14,    q15
    vrhadd.u8   q1,     q12,    q13
    vrhadd.u8   q0,     q0,     q1
    vabd.u8     d0,     d0,     d1

    @ RSUM_CSUM_THRESH >> 2
    @ d2 is dead here (q1 was consumed by the vrhadd above), so it is used
    @ as scratch instead of a callee-saved register
    vmov.u8     d2,     #5

    @ Eliminate values smaller than RSUM_CSUM_THRESH >> 2
    vcge.u8     d1,     d0,     d2
    vand.u8     d0,     d0,     d1
    @ d0 now contains 8 absolute diff of sums above the threshold

    @ Column values are averages over 4 rows; scale by 4 to match the sums
    vpaddl.u8   d0,     d0
    vshl.u16    d0,     d0,     #2

    @ Add row based adj
    vadd.u16    d20,    d0,     d20
    vpaddl.u16  d20,    d20
    @ d20 now contains 2 adj values

    @ d0 = per-column average of the first and third row of both fields,
    @ d1 = per-column average of the second and fourth row of both fields
    vrhadd.u8   d0,     d28,    d29
    vrhadd.u8   d2,     d24,    d25
    vrhadd.u8   d0,     d0,     d2
    vrhadd.u8   d1,     d30,    d31
    vrhadd.u8   d3,     d26,    d27
    vrhadd.u8   d1,     d1,     d3
    vabd.u8     d0,     d0,     d1
    vpaddl.u8   d0,     d0
    vshl.u16    d0,     d0,     #2
    vpaddl.u16  d0,     d0
    vadd.u32    d21,    d0,     d21
    @ d21 now contains 2 alt values

    @ SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    vshr.u32    d0,     d21,    #3
    vadd.u32    d21,    d21,    d0

    @ SAD_BIAS_ADDITIVE >> 1: alt += 4
    vmov.u32    d0,     #4
    vadd.u32    d21,    d21,    d0

    @ Per sub-block mask: all ones where alt < adj
    vclt.u32    d0,     d21,    d20

    @ Fold both masks into one value; its low word is non-zero when at
    @ least one of the two masks is set
    vpaddl.u32  d0,     d0
    vmov.u32    r0,     d0[0]

    @ Normalise the result to 0 / 1
    cmp         r0,     #0
    movne       r0,     #1

    @ Leaf routine: lr is untouched and nothing was pushed
    bx          lr