/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
*  prediction
*
* @author
*  Parthiban V
*
* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"

/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and
*  stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
*  of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
*  off0
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with the source
*
* @param[in] off0
*  Offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.       */
    /* The height loop is unrolled as well, so 2 rows are processed per    */
    /* iteration, and the stores handle both rows.                         */
    /* vcombine_u16 is used because the 16x4 value obtained after          */
    /* narrowing cannot be saturated and narrowed directly.                */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI
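
/* The NEON path above is equivalent to the following scalar reference.
 * This is an illustrative sketch added for clarity, not part of the
 * library's API; it assumes C's >> on the signed intermediate behaves as
 * an arithmetic shift, matching vshlq_s32 with a negative shift count. */
static void weighted_pred_uni_scalar_sketch(WORD16 *pi2_src, UWORD8 *pu1_dst,
                                            WORD32 src_strd, WORD32 dst_strd,
                                            WORD32 wgt0, WORD32 off0,
                                            WORD32 shift, WORD32 lvl_shift,
                                            WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off0;
            /* Saturate to 8 bits, mirroring vqmovun_s32/vqmovn_u16 */
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
    }
}
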
/**
*******************************************************************************
*
* @brief
*  Does chroma uni-weighted prediction on the interleaved CbCr array pointed
*  to by pi2_src and stores the result at the location pointed to by
*  pu1_dst.
*  Assumptions: the function is optimized assuming the width and height are
*  multiples of 2.
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
*  off0, applied with the Cb and Cr weights and offsets on alternate lanes
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Weight to be multiplied with the Cb component of the source
*
* @param[in] wgt0_cr
*  Weight to be multiplied with the Cr component of the source
*
* @param[in] off0_cb
*  Offset for Cb, added after rounding and shifting
*
* @param[in] off0_cr
*  Offset for Cr, added after rounding and shifting
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (per chroma component)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);

    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.       */
    /* The height loop is unrolled as well, so 2 rows are processed per    */
    /* iteration, and the stores handle both rows.                         */
    /* vcombine_u16 is used because the 16x4 value obtained after          */
    /* narrowing cannot be saturated and narrowed directly.                */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI
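
/* Scalar reference for the chroma path above (an illustrative sketch, not
 * part of the library's API). The source is semi-planar, so even lanes
 * carry Cb and odd lanes carry Cr; the vzip_s16/vzipq_s32 calls above build
 * {cb, cr, cb, cr} weight and offset vectors to match that layout. */
static void weighted_pred_chroma_uni_scalar_sketch(WORD16 *pi2_src,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd,
                                                   WORD32 dst_strd,
                                                   WORD32 wgt0_cb, WORD32 wgt0_cr,
                                                   WORD32 off0_cb, WORD32 off0_cr,
                                                   WORD32 shift, WORD32 lvl_shift,
                                                   WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)
        {
            /* Select the per-component weight and offset by lane parity */
            WORD32 wgt = (col & 1) ? wgt0_cr : wgt0_cb;
            WORD32 off = (col & 1) ? off0_cr : off0_cb;
            WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off;
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
    }
}
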
/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
*  pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
*  of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
*  ((off0 + off1 + 1) << (shift - 1)) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with source 1
*
* @param[in] off0
*  Offset 0
*
* @param[in] wgt1
*  Weight to be multiplied with source 2
*
* @param[in] off1
*  Offset 1
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.       */
    /* The height loop is unrolled as well, so 2 rows are processed per    */
    /* iteration, and the stores handle both rows.                         */
    /* vcombine_u16 is used because the 16x4 value obtained after          */
    /* narrowing cannot be saturated and narrowed directly.                */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI
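
/* Scalar reference for the bi-weighted path above (an illustrative sketch,
 * not part of the library's API). Note how the rounding term and both
 * offsets fold into a single constant, exactly as tmp_lvl_shift is computed
 * in the NEON path. */
static void weighted_pred_bi_scalar_sketch(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd1, WORD32 src_strd2,
                                           WORD32 dst_strd,
                                           WORD32 wgt0, WORD32 off0,
                                           WORD32 wgt1, WORD32 off1,
                                           WORD32 shift,
                                           WORD32 lvl_shift1, WORD32 lvl_shift2,
                                           WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    WORD32 rnd = (off0 + off1 + 1) << (shift - 1);
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0;
            i4_tmp += (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1;
            i4_tmp = (i4_tmp + rnd) >> shift;
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
    }
}
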
/**
*******************************************************************************
*
* @brief
*  Does chroma bi-weighted prediction on the interleaved CbCr arrays pointed
*  to by pi2_src1 and pi2_src2 and stores the result at the location pointed
*  to by pu1_dst.
*  Assumptions: the function is optimized assuming the width and height are
*  multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
*  ((off0 + off1 + 1) << (shift - 1)) ) >> shift, applied with the Cb and Cr
*  weights and offsets on alternate lanes
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Weight to be multiplied with the Cb component of source 1
*
* @param[in] wgt0_cr
*  Weight to be multiplied with the Cr component of source 1
*
* @param[in] off0_cb
*  Offset 0 for Cb
*
* @param[in] off0_cr
*  Offset 0 for Cr
*
* @param[in] wgt1_cb
*  Weight to be multiplied with the Cb component of source 2
*
* @param[in] wgt1_cr
*  Weight to be multiplied with the Cr component of source 2
*
* @param[in] off1_cb
*  Offset 1 for Cb
*
* @param[in] off1_cr
*  Offset 1 for Cr
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (per chroma component)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);

    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.       */
    /* The height loop is unrolled as well, so 2 rows are processed per    */
    /* iteration, and the stores handle both rows.                         */
    /* vcombine_u16 is used because the 16x4 value obtained after          */
    /* narrowing cannot be saturated and narrowed directly.                */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI
/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
*  of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
*  >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.       */
    /* The height loop is unrolled as well, so 2 rows are processed per    */
    /* iteration, and the stores handle both rows.                         */
    /* vcombine_u16 is used because the 16x4 value obtained after          */
    /* narrowing cannot be saturated and narrowed directly.                */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT
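
/* Scalar reference for the default (equal-weight) bi prediction above (an
 * illustrative sketch, not part of the library's API). With
 * shift = SHIFT_14_MINUS_BIT_DEPTH + 1, this is simply the rounded average
 * of the two 14-bit predictions brought back to pixel range. */
static void weighted_pred_bi_default_scalar_sketch(WORD16 *pi2_src1,
                                                   WORD16 *pi2_src2,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd1,
                                                   WORD32 src_strd2,
                                                   WORD32 dst_strd,
                                                   WORD32 lvl_shift1,
                                                   WORD32 lvl_shift2,
                                                   WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp = pi2_src1[row * src_strd1 + col] + lvl_shift1;
            i4_tmp += pi2_src2[row * src_strd2 + col] + lvl_shift2;
            i4_tmp = (i4_tmp + (1 << (shift - 1))) >> shift;
            pu1_dst[row * dst_strd + col] =
                (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
    }
}
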
/**
*******************************************************************************
*
* @brief
*  Does chroma default bi-weighted prediction on the interleaved CbCr arrays
*  pointed to by pi2_src1 and pi2_src2 and stores the result at the location
*  pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width and height are
*  multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
*  >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (per chroma component)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* i4_tmp1_t and i4_tmp2_t are used to process 2 rows at a time.       */
    /* The height loop is unrolled as well, so 2 rows are processed per    */
    /* iteration, and the stores handle both rows.                         */
    /* vcombine_u16 is used because the 16x4 value obtained after          */
    /* narrowing cannot be saturated and narrowed directly.                */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT
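
/* Hypothetical usage sketch (illustrative only; buffer contents, sizes and
 * strides are made up). A 4x2 luma block is averaged from two 14-bit
 * prediction buffers into 8-bit output. */
static void weighted_pred_usage_sketch(void)
{
    WORD16 ai2_pred_l0[4 * 2] = { 0 };
    WORD16 ai2_pred_l1[4 * 2] = { 0 };
    UWORD8 au1_out[4 * 2];

    /* src strides = dst stride = 4, no level shifts, ht = 2, wd = 4 */
    ihevc_weighted_pred_bi_default_neonintr(ai2_pred_l0, ai2_pred_l1, au1_out,
                                            4, 4, 4, 0, 0, 2, 4);
}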