994 lines
27 KiB
ArmAsm
994 lines
27 KiB
ArmAsm
//******************************************************************************
|
|
//*
|
|
//* Copyright (C) 2015 The Android Open Source Project
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************
|
|
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
//*/
|
|
//**
|
|
|
|
///**
|
|
//******************************************************************************
|
|
//*
|
|
//*
|
|
//* @brief
|
|
//* This file contains definitions of routines that compute distortion
|
|
//* between two macro/sub blocks of identical dimensions
|
|
//*
|
|
//* @author
|
|
//* Ittiam
|
|
//*
|
|
//* @par List of Functions:
|
|
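//*  - ime_compute_sad_16x16_fast_av8()
//*  - ime_compute_sad_16x8_av8()
//*  - ime_compute_sad_16x16_ea8_av8()
//*  - ime_calculate_sad2_prog_av8()
//*  - ime_calculate_sad3_prog_av8()
//*  - ime_sub_pel_compute_sad_16x16_av8()
//*  - ime_compute_sad_16x16_av8()
//*  - ime_calculate_sad4_prog_av8()
//*  - ime_compute_satqd_16x16_lumainter_av8()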
|
|
//*
|
|
//* @remarks
|
|
//* None
|
|
//*
|
|
//*******************************************************************************
|
|
//
|
|
|
|
|
|
///**
|
|
//******************************************************************************
|
|
//*
|
|
//* @brief computes distortion (SAD) between 2 16x16 blocks (fast mode)
|
|
//*
|
|
//* @par Description
|
|
//*  This function computes an approximate SAD between 2 16x16 blocks: the
//*  absolute differences of every other row (8 rows) are summed and the sum
//*  is doubled. i4_max_sad is accepted for interface compatibility but is
//*  not read; this variant has no early exit.
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[out] pu1_dst
|
|
//* UWORD8 pointer to the destination
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] dst_strd
|
|
//* integer destination stride
|
|
//*
|
|
//* @param[in] i4_max_sad
|
|
//* integer maximum allowed distortion
|
|
//*
|
|
//* @param[in] pi4_mb_distortion
|
|
//* integer evaluated sad
|
|
//*
|
|
//* @remarks
|
|
//*
|
|
//******************************************************************************
|
|
//*/
|
|
.text
.p2align 2

// push_v_regs
//   Spills d8-d15 (the low 64 bits of v8-v15) to a 64-byte area carved out
//   below the incoming stack pointer. SP is lowered once, by the first store;
//   the remaining pairs are written at fixed offsets inside that area.
//   Layout after the macro (offsets from the new sp):
//     +0  d14,d15   +16 d12,d13   +32 d10,d11   +48 d8,d9
.macro push_v_regs
    stp       d14, d15, [sp, #-64]!
    stp       d12, d13, [sp, #16]
    stp       d10, d11, [sp, #32]
    stp       d8, d9, [sp, #48]
.endm

// pop_v_regs
//   Exact inverse of push_v_regs: reloads d8-d15 from the 64-byte area at sp
//   and releases it. SP is raised by the last load only.
.macro pop_v_regs
    ldp       d8, d9, [sp, #48]
    ldp       d10, d11, [sp, #32]
    ldp       d12, d13, [sp, #16]
    ldp       d14, d15, [sp], #64
.endm
|
|
|
|
.global ime_compute_sad_16x16_fast_av8
ime_compute_sad_16x16_fast_av8:
    // Register usage
    //   x0, x1 : pointers to the two 16-byte-wide blocks being compared
    //   w2, w3 : row strides (32-bit, sign-extended here) for x0 and x1
    //   w4     : not read by this routine
    //   x5     : address that receives the 32-bit result
    //   x6     : loop counter, v30 : eight 16-bit partial sums
    // Only every other row is visited (strides are doubled), 8 rows in
    // total, and the final sum is doubled before it is stored.
    push_v_regs
    sbfiz     x2, x2, #1, #32           // x2 = 2 * sign_extend(w2)
    sbfiz     x3, x3, #1, #32           // x3 = 2 * sign_extend(w3)
    movi      v30.8h, #0
    mov       x6, #2                    // 2 passes x 4 sampled rows

1:
    // Fetch four sampled rows from each block ...
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    // ... and accumulate |a - b| for the low and high 8 bytes of each row.
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      1b

    uaddlv    s30, v30.8h               // horizontal sum of the 8 lanes
    shl       v30.2s, v30.2s, #1        // double: only half the rows were read
    st1       {v30.s}[0], [x5]

    pop_v_regs
    ret
|
|
|
|
|
|
///**
|
|
//******************************************************************************
|
|
//*
|
|
//* @brief computes distortion (SAD) between 2 16x8 blocks
|
|
//*
|
|
//*
|
|
//* @par Description
|
|
//*  This function computes the SAD between 2 16x8 blocks (16 bytes wide,
//*  8 rows). u4_max_sad is accepted for interface compatibility but is not
//*  read; the full block is always evaluated.
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[out] pu1_dst
|
|
//* UWORD8 pointer to the destination
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] dst_strd
|
|
//* integer destination stride
|
|
//*
|
|
//* @param[in] u4_max_sad
|
|
//* integer maximum allowed distortion
|
|
//*
|
|
//* @param[in] pi4_mb_distortion
|
|
//* integer evaluated sad
|
|
//*
|
|
//* @remarks
|
|
//*
|
|
//******************************************************************************
|
|
//*/
|
|
//
|
|
.global ime_compute_sad_16x8_av8
ime_compute_sad_16x8_av8:
    // Register usage
    //   x0, x1 : pointers to the two 16-byte-wide blocks being compared
    //   w2, w3 : row strides (32-bit, sign-extended here) for x0 and x1
    //   w4     : not read by this routine
    //   x5     : address that receives the 32-bit SAD
    //   x6     : loop counter, v30 : eight 16-bit partial sums
    // Rows are consumed at the caller's stride (no doubling): 2 x 4 = 8 rows.
    push_v_regs
    movi      v30.8h, #0
    sxtw      x2, w2
    sxtw      x3, w3
    mov       x6, #2

1:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      1b

    uaddlv    s30, v30.8h               // horizontal sum of the 8 lanes
    st1       {v30.s}[0], [x5]

    pop_v_regs
    ret
|
|
|
|
///**
|
|
//******************************************************************************
|
|
//*
|
|
//* @brief computes distortion (SAD) between 2 16x16 blocks with early exit
|
|
//*
|
|
//* @par Description
|
|
//*  This function computes SAD between 2 16x16 blocks. There is a provision
|
|
//* for early exit if the up-to computed SAD exceeds maximum allowed SAD. To
|
|
//* compute the distortion of the entire block set u4_max_sad to USHRT_MAX.
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[out] pu1_dst
|
|
//* UWORD8 pointer to the destination
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] dst_strd
|
|
//* integer destination stride
|
|
//*
|
|
//* @param[in] i4_max_sad
|
|
//* integer maximum allowed distortion
|
|
//*
|
|
//* @param[in] pi4_mb_distortion
|
|
//* integer evaluated sad
|
|
//*
|
|
//* @remarks
|
|
//*
|
|
//******************************************************************************
|
|
//*/
|
|
|
|
.global ime_compute_sad_16x16_ea8_av8
ime_compute_sad_16x16_ea8_av8:
    // Register usage
    //   x0, x1 : pointers to the two 16x16 blocks (rows 0, 2, 4, ... pass)
    //   x7, x8 : same blocks, one row further down (rows 1, 3, 5, ... pass)
    //   w2, w3 : row strides (sign-extended, then doubled)
    //   w4     : limit; compared (signed) against the sum of the first pass
    //   x5     : address that receives the 32-bit result
    //   v30    : eight 16-bit partial sums, v31 : reduced sum
    // If the first-pass sum is greater than w4 the routine stores that
    // partial sum and returns without reading the remaining 8 rows.
    push_v_regs
    movi      v30.8h, #0

    add       x7, x0, w2, sxtw          // second row of first block
    add       x8, x1, w3, sxtw          // second row of second block
    sbfiz     x2, x2, #1, #32           // x2 = 2 * sign_extend(w2)
    sbfiz     x3, x3, #1, #32           // x3 = 2 * sign_extend(w3)

    // Pass 1: eight alternate rows starting at row 0, two rows per repeat.
    .rept 4
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    .endr

    uaddlv    s31, v30.8h               // partial SAD after 8 rows
    mov       w6, v31.s[0]
    cmp       w6, w4
    b.gt      2f                        // over the limit: report partial sum

    // Pass 2: the remaining eight rows, starting at row 1.
    .rept 4
    ld1       {v0.16b}, [x7], x2
    ld1       {v1.16b}, [x8], x3
    ld1       {v2.16b}, [x7], x2
    ld1       {v3.16b}, [x8], x3
    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    .endr

    uaddlv    s31, v30.8h               // SAD over all 16 rows

2:
    st1       {v31.s}[0], [x5]
    pop_v_regs
    ret
|
|
|
|
|
|
///*
|
|
////---------------------------------------------------------------------------
|
|
//// Function Name : ime_calculate_sad2_prog_av8()
|
|
////
|
|
//// Detail Description : This function finds the SAD values of the source
////                      block against 2 reference blocks at one shot
|
|
////
|
|
//// Platform : CortexAv8/NEON .
|
|
////
|
|
////-----------------------------------------------------------------------------
|
|
//*/
|
|
|
|
.global ime_calculate_sad2_prog_av8
ime_calculate_sad2_prog_av8:

    // x0    = ref1     <UWORD8 *>
    // x1    = ref2     <UWORD8 *>
    // x2    = src     <UWORD8 *>
    // w3    = RefBufferWidth <UWORD32>
    // w4    = CurBufferWidth <UWORD32>
    // x5    = psad <UWORD32 *>
    //
    // Output: psad[0] = result for ref1 vs src, psad[1] = result for ref2 vs src.
    // v30 accumulates |ref1 - src|, v31 accumulates |ref2 - src| (8 x 16-bit).
    //
    // Fixes relative to the previous revision:
    //   * source rows are loaded through x2 (src), not x3 (the ref stride);
    //   * the last ref2 accumulate uses v10 (its own row), not v0;
    //   * the final pairwise add works on all four 32-bit lanes so that the
    //     ref2 total reaches lane 1 instead of being a copy of the ref1 total.
    //
    // NOTE(review): the loop visits 8 x 4 = 32 rows at the caller's stride and
    // the totals are doubled before the store. Both are kept exactly as they
    // were; confirm against the caller whether 16 rows / no doubling is meant.
    // NOTE(review): the widths are documented as UWORD32 but are sign-extended.
    push_v_regs
    sxtw      x3, w3
    sxtw      x4, w4
    mov       x6, #8
    movi      v30.8h, #0
    movi      v31.8h, #0

core_loop_ime_calculate_sad2_prog_av8:

    ld1       {v0.16b}, [x0], x3        // ref1 row
    ld1       {v1.16b}, [x1], x3        // ref2 row
    ld1       {v2.16b}, [x2], x4        // src row

    ld1       {v3.16b}, [x0], x3
    ld1       {v4.16b}, [x1], x3
    ld1       {v5.16b}, [x2], x4

    uabal     v30.8h, v0.8b, v2.8b
    uabal2    v30.8h, v0.16b, v2.16b
    uabal     v31.8h, v1.8b, v2.8b
    uabal2    v31.8h, v1.16b, v2.16b

    uabal     v30.8h, v3.8b, v5.8b
    uabal2    v30.8h, v3.16b, v5.16b
    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b

    ld1       {v6.16b}, [x0], x3
    ld1       {v7.16b}, [x1], x3
    ld1       {v8.16b}, [x2], x4

    ld1       {v9.16b}, [x0], x3
    ld1       {v10.16b}, [x1], x3
    ld1       {v11.16b}, [x2], x4

    uabal     v30.8h, v6.8b, v8.8b
    uabal2    v30.8h, v6.16b, v8.16b
    uabal     v31.8h, v7.8b, v8.8b
    uabal2    v31.8h, v7.16b, v8.16b

    uabal     v30.8h, v9.8b, v11.8b
    uabal2    v30.8h, v9.16b, v11.16b
    uabal     v31.8h, v10.8b, v11.8b
    uabal2    v31.8h, v10.16b, v11.16b

    subs      x6, x6, #1
    bne       core_loop_ime_calculate_sad2_prog_av8

    addp      v30.8h, v30.8h, v31.8h    // lanes 0-3: ref1 pairs, 4-7: ref2 pairs
    uaddlp    v30.4s, v30.8h            // [r1a, r1b, r2a, r2b] as 32-bit
    addp      v30.4s, v30.4s, v30.4s    // lane 0 = ref1 total, lane 1 = ref2 total
    shl       v30.2s, v30.2s, #1

    st1       {v30.2s}, [x5]
    pop_v_regs
    ret
|
|
|
|
///*
|
|
////---------------------------------------------------------------------------
|
|
//// Function Name      : ime_calculate_sad3_prog_av8()
////
//// Detail Description : This function finds the SAD values of the source
////                      block against 3 reference blocks at one shot
|
|
////
|
|
//// Platform : CortexA8/NEON .
|
|
////
|
|
////-----------------------------------------------------------------------------
|
|
//*/
|
|
|
|
.global ime_calculate_sad3_prog_av8
ime_calculate_sad3_prog_av8:

    // x0    = ref1     <UWORD8 *>
    // x1    = ref2     <UWORD8 *>
    // x2    = ref3     <UWORD8 *>
    // x3    = src      <UWORD8 *>
    // w4    = RefBufferWidth <UWORD32>
    // w5    = CurBufferWidth <UWORD32>
    // x6    = psad <UWORD32 *>
    //
    // Output: psad[0], psad[1], psad[2] = results for ref1, ref2, ref3 vs src.
    // v29 / v30 / v31 accumulate |ref1 - src| / |ref2 - src| / |ref3 - src|.
    //
    // Fix relative to the previous revision: the reduction used to combine
    // only v30 and v31 and then add just the low two 32-bit lanes, so the
    // ref2 total was stored twice and the ref1 / ref3 totals were dropped.
    // All three accumulators are now reduced and three words are stored.
    // NOTE(review): this widens the store from 8 to 12 bytes - confirm that
    // every caller provides room for three results.
    //
    // NOTE(review): the loop visits 16 x 2 = 32 rows at the caller's stride
    // and the totals are doubled before the store. Both are kept exactly as
    // they were; confirm against the caller whether that is intended.
    push_v_regs
    sxtw      x4, w4
    sxtw      x5, w5
    mov       x7, #16
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

core_loop_ime_calculate_sad3_prog_av8:

    ld1       {v0.16b}, [x0], x4        // ref1 row
    ld1       {v1.16b}, [x1], x4        // ref2 row
    ld1       {v2.16b}, [x2], x4        // ref3 row
    ld1       {v3.16b}, [x3], x5        // src row

    uabal     v29.8h, v0.8b, v3.8b
    uabal2    v29.8h, v0.16b, v3.16b
    uabal     v30.8h, v1.8b, v3.8b
    uabal2    v30.8h, v1.16b, v3.16b
    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x4
    ld1       {v5.16b}, [x1], x4
    ld1       {v6.16b}, [x2], x4
    ld1       {v7.16b}, [x3], x5

    uabal     v29.8h, v4.8b, v7.8b
    uabal2    v29.8h, v4.16b, v7.16b
    uabal     v30.8h, v5.8b, v7.8b
    uabal2    v30.8h, v5.16b, v7.16b
    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

    subs      x7, x7, #1
    bne       core_loop_ime_calculate_sad3_prog_av8

    addp      v29.8h, v29.8h, v30.8h    // lanes 0-3: ref1 pairs, 4-7: ref2 pairs
    addp      v31.8h, v31.8h, v31.8h    // ref3 pairs (duplicated in both halves)
    uaddlp    v29.4s, v29.8h
    uaddlp    v31.4s, v31.8h
    addp      v29.4s, v29.4s, v31.4s    // [ref1, ref2, ref3, ref3] totals
    shl       v29.4s, v29.4s, #1

    st1       {v29.2s}, [x6], #8        // psad[0], psad[1]
    st1       {v29.s}[2], [x6]          // psad[2]
    pop_v_regs
    ret
|
|
|
|
|
|
|
|
|
|
///**
|
|
//******************************************************************************
|
|
//*
|
|
//* @brief computes distortion (SAD) for sub-pel motion estimation
|
|
//*
|
|
//* @par Description
|
|
//*  This function computes SAD for all the 8 half pel points
|
|
//*
|
|
//* @param[out] pi4_sad
|
|
//* integer evaluated sad
|
|
//* pi4_sad[0] - half x
|
|
//* pi4_sad[1] - half x - 1
|
|
//* pi4_sad[2] - half y
|
|
//* pi4_sad[3] - half y - 1
|
|
//* pi4_sad[4] - half xy
|
|
//* pi4_sad[5] - half xy - 1
|
|
//* pi4_sad[6] - half xy - strd
|
|
//* pi4_sad[7] - half xy - 1 - strd
|
|
//*
|
|
//* @remarks
|
|
//*
|
|
//******************************************************************************
|
|
//*/
|
|
|
|
.text
.p2align 2

.global ime_sub_pel_compute_sad_16x16_av8
ime_sub_pel_compute_sad_16x16_av8:
    // Register usage
    //   x0      : source block, advanced by w4 per row
    //   x1      : half-x plane     (x7  = x1 - 1,        "x left")
    //   x2      : half-y plane     (x8  = x2 - stride,   "y top")
    //   x3      : half-xy plane    (x9  = x3 - 1,        "xy left",
    //                               x10 = x3 - stride,   "xy top",
    //                               x11 = x10 - 1,       "xy top left")
    //   w4, w5  : strides (sign-extended) for the source and for all planes
    //   x6      : address that receives eight consecutive 32-bit SADs
    //   v24-v31 : one 8 x 16-bit accumulator per candidate, in output order
    push_v_regs
    sxtw      x4, w4
    sxtw      x5, w5

    movi      v24.8h, #0
    movi      v25.8h, #0
    movi      v26.8h, #0
    movi      v27.8h, #0
    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    sub       x7, x1, #1                // x left
    sub       x8, x2, x5                // y top
    sub       x9, x3, #1                // xy left
    sub       x10, x3, x5               // xy top
    sub       x11, x10, #1              // xy top left

    mov       x12, #16                  // 16 rows, one per iteration

1:
    ld1       {v0.16b}, [x0], x4        // src row, shared by all candidates

    ld1       {v1.16b}, [x1], x5        // x
    uabal     v24.8h, v0.8b, v1.8b
    uabal2    v24.8h, v0.16b, v1.16b

    ld1       {v2.16b}, [x7], x5        // x left
    uabal     v25.8h, v0.8b, v2.8b
    uabal2    v25.8h, v0.16b, v2.16b

    ld1       {v3.16b}, [x2], x5        // y
    uabal     v26.8h, v0.8b, v3.8b
    uabal2    v26.8h, v0.16b, v3.16b

    ld1       {v9.16b}, [x8], x5        // y top
    uabal     v27.8h, v0.8b, v9.8b
    uabal2    v27.8h, v0.16b, v9.16b

    ld1       {v10.16b}, [x3], x5       // xy
    uabal     v28.8h, v0.8b, v10.8b
    uabal2    v28.8h, v0.16b, v10.16b

    ld1       {v11.16b}, [x9], x5       // xy left
    uabal     v29.8h, v0.8b, v11.8b
    uabal2    v29.8h, v0.16b, v11.16b

    ld1       {v12.16b}, [x10], x5      // xy top
    uabal     v30.8h, v0.8b, v12.8b
    uabal2    v30.8h, v0.16b, v12.16b

    ld1       {v13.16b}, [x11], x5      // xy top left
    uabal     v31.8h, v0.8b, v13.8b
    uabal2    v31.8h, v0.16b, v13.16b

    subs      x12, x12, #1
    b.ne      1b

    // Fold each accumulator to one 32-bit total, keeping output order:
    // v24 = totals 0..3, v25 = totals 4..7.
    addp      v24.8h, v24.8h, v25.8h
    addp      v26.8h, v26.8h, v27.8h
    addp      v28.8h, v28.8h, v29.8h
    addp      v30.8h, v30.8h, v31.8h

    uaddlp    v24.4s, v24.8h
    uaddlp    v26.4s, v26.8h
    uaddlp    v28.4s, v28.8h
    uaddlp    v30.4s, v30.8h

    addp      v24.4s, v24.4s, v26.4s
    addp      v25.4s, v28.4s, v30.4s

    st1       {v24.4s-v25.4s}, [x6]

    pop_v_regs
    ret
|
|
|
|
|
|
///**
|
|
//******************************************************************************
|
|
//*
|
|
//* @brief computes distortion (SAD) between 2 16x16 blocks
|
|
//*
|
|
//* @par Description
|
|
//*  This function computes the SAD between 2 16x16 blocks over all 16 rows.
//*  i4_max_sad is accepted for interface compatibility but is not read;
//*  this variant has no early exit.
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[out] pu1_dst
|
|
//* UWORD8 pointer to the destination
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] dst_strd
|
|
//* integer destination stride
|
|
//*
|
|
//* @param[in] i4_max_sad
|
|
//* integer maximum allowed distortion
|
|
//*
|
|
//* @param[in] pi4_mb_distortion
|
|
//* integer evaluated sad
|
|
//*
|
|
//* @remarks
|
|
//*
|
|
//******************************************************************************
|
|
//*/
|
|
.global ime_compute_sad_16x16_av8
ime_compute_sad_16x16_av8:
    // Register usage
    //   x0, x1 : pointers to the two 16-byte-wide blocks being compared
    //   w2, w3 : row strides (32-bit, sign-extended here) for x0 and x1
    //   w4     : not read by this routine
    //   x5     : address that receives the 32-bit SAD
    //   x6     : loop counter, v30 : eight 16-bit partial sums
    // 4 passes x 4 rows = all 16 rows of the block.
    push_v_regs
    movi      v30.8h, #0
    sxtw      x2, w2
    sxtw      x3, w3
    mov       x6, #4

1:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v30.8h, v0.8b, v1.8b
    uabal2    v30.8h, v0.16b, v1.16b
    uabal     v30.8h, v2.8b, v3.8b
    uabal2    v30.8h, v2.16b, v3.16b
    uabal     v30.8h, v4.8b, v5.8b
    uabal2    v30.8h, v4.16b, v5.16b
    uabal     v30.8h, v6.8b, v7.8b
    uabal2    v30.8h, v6.16b, v7.16b

    subs      x6, x6, #1
    b.ne      1b

    uaddlv    s30, v30.8h               // horizontal sum of the 8 lanes
    st1       {v30.s}[0], [x5]

    pop_v_regs
    ret
|
|
|
|
|
|
///*
|
|
////---------------------------------------------------------------------------
|
|
//// Function Name      : ime_calculate_sad4_prog_av8()
////
//// Detail Description : This function finds the SAD values at the 4
////                      neighbouring positions (left, right, top, bottom)
////                      at one shot
|
|
////
|
|
//// Platform : CortexA8/NEON .
|
|
////
|
|
////-----------------------------------------------------------------------------
|
|
//*/
|
|
|
|
.global ime_calculate_sad4_prog_av8
ime_calculate_sad4_prog_av8:
    // Register usage
    //   x0     : centre position; the four neighbours one byte / one row away
    //            are compared (presumably the reference - confirm with caller)
    //   x1     : block compared against every neighbour, advanced by w3 per row
    //   w2, w3 : strides (sign-extended) for the x0 side and the x1 side
    //   x4     : address that receives four consecutive 32-bit SADs in the
    //            order left, right, top, bottom
    //   v28-v31: one 8 x 16-bit accumulator per neighbour, in output order
    push_v_regs
    sxtw      x2, w2
    sxtw      x3, w3

    movi      v28.8h, #0
    movi      v29.8h, #0
    movi      v30.8h, #0
    movi      v31.8h, #0

    sub       x5, x0, #1                // left
    add       x6, x0, #1                // right
    sub       x7, x0, x2                // top
    add       x8, x0, x2                // bottom

    mov       x9, #16                   // 16 rows, one per iteration

1:
    ld1       {v0.16b}, [x1], x3        // row shared by all four comparisons

    ld1       {v1.16b}, [x5], x2        // left
    uabal     v28.8h, v0.8b, v1.8b
    uabal2    v28.8h, v0.16b, v1.16b

    ld1       {v2.16b}, [x6], x2        // right
    uabal     v29.8h, v0.8b, v2.8b
    uabal2    v29.8h, v0.16b, v2.16b

    ld1       {v3.16b}, [x7], x2        // top
    uabal     v30.8h, v0.8b, v3.8b
    uabal2    v30.8h, v0.16b, v3.16b

    ld1       {v9.16b}, [x8], x2        // bottom
    uabal     v31.8h, v0.8b, v9.8b
    uabal2    v31.8h, v0.16b, v9.16b

    subs      x9, x9, #1
    b.ne      1b

    // Fold the four accumulators into v28 = [left, right, top, bottom].
    addp      v28.8h, v28.8h, v29.8h
    addp      v30.8h, v30.8h, v31.8h
    uaddlp    v28.4s, v28.8h
    uaddlp    v30.4s, v30.8h
    addp      v28.4s, v28.4s, v30.4s

    st1       {v28.4s}, [x4]
    pop_v_regs
    ret
|
|
|
|
|
|
|
|
//*****************************************************************************
|
|
//*
|
|
//* Function Name : ime_compute_satqd_16x16_lumainter_av8
|
|
//*  Description     : This function computes SAD for a 16x16 block.
//                   : It also computes if any 4x4 block will have a nonzero coefficient after transform and quant
|
|
//
|
|
// Arguments : x0 :pointer to src buffer
|
|
// x1 :pointer to est buffer
|
|
// x2 :source stride
|
|
// x3 :est stride
|
|
//                      x4-x6 :Threshold, distortion, is_nonzero (passed in registers)
|
|
//*
|
|
//* Values Returned : NONE
|
|
//*
|
|
//* Register Usage : x0-x11
|
|
//* Stack Usage :
|
|
//* Cycles : Around
|
|
//*  Interruptibility : Interruptible
|
|
//*
|
|
//* Known Limitations
|
|
//* \Assumptions :
|
|
//*
|
|
//* Revision History :
|
|
//* DD MM YYYY Author(s) Changes
|
|
//* 14 04 2014 Harinarayanan K K First version
|
|
//*
|
|
//*****************************************************************************
|
|
.global ime_compute_satqd_16x16_lumainter_av8
ime_compute_satqd_16x16_lumainter_av8:
    //x0 :pointer to src buffer
    //x1 :pointer to est buffer
    //w2 :Source stride
    //w3 :Pred stride
    //x4 :Threshold pointer (nine 16-bit values are read); reused below as
    //    the "some comparison fired" flag once the thresholds are loaded
    //x5 :Distortion,ie SAD (32-bit result is stored here)
    //x6 :is nonzero (32-bit 0/1 is stored here)
    //x7 :loop counter (4 bands of 4 rows)
    //
    // Flow: each band of 4 rows is run through the threshold tests. While
    // no test fires, the band's SAD is added to v31 and the next band is
    // examined. As soon as a test fires, the remaining bands are handled by
    // the plain SAD loop (core_loop_compute_sad) and is_nonzero becomes 1.
    //
    // d8-d15 are saved once by push_v_regs / pop_v_regs. The previous
    // revision followed the macro with a second, identical stp/ldp sequence,
    // which only cost 64 extra bytes of stack and 8 extra memory accesses.
    push_v_regs
    sxtw      x2, w2
    sxtw      x3, w3

    ld1       {v30.8h}, [x4]

    dup       v20.4h, v30.h[1]          //ls1
    dup       v24.4h, v30.h[0]          //ls2
    dup       v21.4h, v30.h[5]          //ls3
    dup       v25.4h, v30.h[7]          //ls4
    dup       v22.4h, v30.h[3]          //ls5
    dup       v26.4h, v30.h[4]          //ls6
    dup       v23.4h, v30.h[6]          //ls7
    dup       v27.4h, v30.h[2]          //ls8

    // Pack two thresholds per 128-bit register (low half / high half).
    mov       v20.d[1], v24.d[0]
    mov       v21.d[1], v25.d[0]
    mov       v22.d[1], v26.d[0]
    mov       v23.d[1], v27.d[0]

    // Ninth threshold value, compared directly against the 4x4 SADs.
    add       x4, x4, #16
    ld1       {v29.h}[0], [x4]
    dup       v29.4h, v29.h[0]

    movi      v31.8h, #0

    mov       x7, #4
core_loop_satqd_ime_compute_satqd_16x16_lumainter:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3
    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    // Absolute differences: v10-v13 = columns 0-7, v15-v18 = columns 8-15.
    uabdl     v10.8h, v0.8b, v1.8b
    uabdl2    v15.8h, v0.16b, v1.16b
    uabdl     v11.8h, v2.8b, v3.8b
    uabdl2    v16.8h, v2.16b, v3.16b
    uabdl     v12.8h, v4.8b, v5.8b
    uabdl2    v17.8h, v4.16b, v5.16b
    uabdl     v13.8h, v6.8b, v7.8b
    uabdl2    v18.8h, v6.16b, v7.16b

    add       v0.8h, v10.8h, v13.8h
    add       v1.8h, v11.8h, v12.8h
    add       v2.8h, v15.8h, v18.8h
    add       v3.8h, v16.8h, v17.8h

    //v0 : S1 S4 S4 S1 A1 A4 A4 A1
    //v1 : S2 S3 S3 S2 A2 A3 A3 A2
    //v2 : B1 B4 B4 B1 X1 X4 X4 X1
    //v3 : B3 B2 B2 B3 X3 X2 X2 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    trn1      v6.8h, v2.8h, v3.8h
    trn2      v7.8h, v2.8h, v3.8h

    trn1      v0.4s, v4.4s, v6.4s
    trn2      v2.4s, v4.4s, v6.4s
    trn1      v1.4s, v5.4s, v7.4s
    trn2      v3.4s, v5.4s, v7.4s

    add       v4.8h, v0.8h, v3.8h
    add       v5.8h, v1.8h, v2.8h
    //v4 : S1 S2 B1 B2 A1 A2 X1 X2
    //v5 : S4 S3 B4 B3 A4 A3 X4 X3

    //compute sad for each 4x4 block
    add       v6.8h, v4.8h, v5.8h
    addp      v19.8h, v6.8h, v6.8h
    //duplicate the sad into 128 bit so that we can compare using 128bit
    add       v31.4h, v31.4h, v19.4h    // running SAD (write clears v31 upper half)

    //sad_2 = sad_1<<1;
    shl       v28.8h, v19.8h, #1

    //sad_2 - pu2_thrsh
    sub       v24.8h, v28.8h, v20.8h
    sub       v25.8h, v28.8h, v21.8h
    sub       v26.8h, v28.8h, v22.8h
    sub       v27.8h, v28.8h, v23.8h

    trn1      v0.4s, v4.4s, v5.4s
    trn2      v1.4s, v4.4s, v5.4s
    //v0 : S1 S2 S4 S3 A1 A2 A4 A3
    //v1 : B1 B2 B4 B3 X1 X2 X4 X3

    trn1      v4.8h, v0.8h, v1.8h
    trn2      v5.8h, v0.8h, v1.8h
    //v4 : S1 B1 S4 B4 A1 X1 A4 X4
    //v5 : S2 B2 S3 B3 A2 X2 A3 X3

    mov       v7.s[0], v4.s[1]
    mov       v7.s[1], v4.s[3]
    mov       v6.s[0], v5.s[1]          // V4 //S1 B1 A1 X1
    mov       v6.s[1], v5.s[3]          // V5 //S2 B2 A2 X2
    mov       v4.s[1], v4.s[2]          // V6 //S3 B3 A3 X3
    mov       v5.s[1], v5.s[2]          // V7 //S4 B4 A4 X4

    shl       v0.4h, v4.4h, #1          //S1<<1
    shl       v1.4h, v5.4h, #1          //S2<<1
    shl       v2.4h, v6.4h, #1          //S3<<1
    shl       v3.4h, v7.4h, #1          //S4<<1

    add       v8.4h, v5.4h, v6.4h       //(s2[j] + s3[j]))
    add       v9.4h, v4.4h, v7.4h       //(s1[j] + s4[j]))
    add       v10.4h, v6.4h, v7.4h      //(s3[j] + s4[j]))
    sub       v11.4h, v6.4h, v0.4h      //(s3[j] - (s1[j]<<1))
    sub       v12.4h, v7.4h, v1.4h      //(s4[j] - (s2[j]<<1))
    add       v13.4h, v4.4h, v5.4h      //(s1[j] + s2[j]))
    sub       v14.4h, v5.4h, v3.4h      //(s2[j] - (s4[j]<<1)))
    sub       v15.4h, v4.4h, v2.4h      //(s1[j] - (s3[j]<<1)))

    mov       v8.d[1], v9.d[0]
    mov       v10.d[1], v11.d[0]
    mov       v12.d[1], v13.d[0]
    mov       v14.d[1], v15.d[0]

    cmge      v0.8h, v24.8h, v8.8h      //ls1 ls2
    cmge      v1.8h, v25.8h, v10.8h     //ls3 ls4
    cmge      v2.8h, v26.8h, v12.8h     //ls5 ls6
    cmge      v3.8h, v27.8h, v14.8h     //ls7 ls8
    cmge      v4.4h, v19.4h, v29.4h     //sad

    // Collapse every comparison mask into the low 64 bits of v2.
    orr       v0.16b, v0.16b, v1.16b
    orr       v2.16b, v2.16b, v3.16b
    orr       v2.16b, v0.16b, v2.16b
    xtn       v2.8b, v2.8h
    orr       v2.8b, v2.8b, v4.8b

    //if the comparison is non zero, out
    mov       x4, v2.d[0]
    cmp       x4, #0
    bne       core_loop_compute_sad_pre

    subs      x7, x7, #1
    bne       core_loop_satqd_ime_compute_satqd_16x16_lumainter
    b         satdq_end_func


    // Plain SAD for the bands that remain after a threshold test has fired.
core_loop_compute_sad:
    ld1       {v0.16b}, [x0], x2
    ld1       {v1.16b}, [x1], x3
    ld1       {v2.16b}, [x0], x2
    ld1       {v3.16b}, [x1], x3

    uabal     v31.8h, v0.8b, v1.8b
    uabal2    v31.8h, v0.16b, v1.16b

    uabal     v31.8h, v2.8b, v3.8b
    uabal2    v31.8h, v2.16b, v3.16b

    ld1       {v4.16b}, [x0], x2
    ld1       {v5.16b}, [x1], x3
    ld1       {v6.16b}, [x0], x2
    ld1       {v7.16b}, [x1], x3

    uabal     v31.8h, v4.8b, v5.8b
    uabal2    v31.8h, v4.16b, v5.16b

    uabal     v31.8h, v6.8b, v7.8b
    uabal2    v31.8h, v6.16b, v7.16b

core_loop_compute_sad_pre:
    // The band that triggered the exit is already in v31; count it off and
    // continue only if bands remain.
    subs      x7, x7, #1
    bne       core_loop_compute_sad

satdq_end_func:

    // is_nonzero = (x4 != 0) ? 1 : 0
    cmp       x4, #0
    cset      w7, ne
    str       w7, [x6]

    addp      v31.8h, v31.8h, v31.8h
    uaddlp    v31.4s, v31.8h
    addp      v31.2s, v31.2s, v31.2s
    st1       {v31.s}[0], [x5]

    pop_v_regs
    ret
|