271 lines
10 KiB
C
271 lines
10 KiB
C
/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevcd_frm_cvt_x86_intr.c
*
* @brief
*  Platform specific intrinsic implementation of certain functions
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ihevcd_itrans_recon_dc
*  - ihevcd_fmt_conv_420sp_to_420p_ssse3
*
* @remarks
*  None
*
*******************************************************************************
*/
#include "string.h"
|
|
#include "ihevc_typedefs.h"
|
|
#include "ihevc_defs.h"
|
|
#include "ihevc_macros.h"
|
|
#include "ihevc_platform_macros.h"
|
|
#include "ihevcd_function_selector.h"
|
|
#include <string.h>
|
|
#include <immintrin.h>
|
|
|
|
|
|
void ihevcd_fmt_conv_420sp_to_420p_ssse3(UWORD8 *pu1_y_src,
|
|
UWORD8 *pu1_uv_src,
|
|
UWORD8 *pu1_y_dst,
|
|
UWORD8 *pu1_u_dst,
|
|
UWORD8 *pu1_v_dst,
|
|
WORD32 wd,
|
|
WORD32 ht,
|
|
WORD32 src_y_strd,
|
|
WORD32 src_uv_strd,
|
|
WORD32 dst_y_strd,
|
|
WORD32 dst_uv_strd,
|
|
WORD32 is_u_first,
|
|
WORD32 disable_luma_copy)
|
|
{
|
|
UWORD8 *pu1_src, *pu1_dst;
|
|
UWORD8 *pu1_u_src, *pu1_v_src;
|
|
WORD32 num_rows, num_cols, src_strd, dst_strd, cols, rows;
|
|
WORD32 i, j;
|
|
|
|
cols = 0;
|
|
pu1_u_src = (UWORD8 *)pu1_uv_src;
|
|
pu1_v_src = (UWORD8 *)pu1_uv_src + 1;
|
|
if(0 == disable_luma_copy)
|
|
{
|
|
/* copy luma */
|
|
pu1_src = (UWORD8 *)pu1_y_src;
|
|
pu1_dst = (UWORD8 *)pu1_y_dst;
|
|
|
|
num_rows = ht;
|
|
num_cols = wd;
|
|
|
|
src_strd = src_y_strd;
|
|
dst_strd = dst_y_strd;
|
|
for(i = 0; i < num_rows; i++)
|
|
{
|
|
memcpy(pu1_dst, pu1_src, num_cols);
|
|
pu1_dst += dst_strd;
|
|
pu1_src += src_strd;
|
|
}
|
|
}
|
|
|
|
/* de-interleave U and V and copy to destination */
|
|
if(!is_u_first)
|
|
{
|
|
UWORD8 *temp = pu1_u_dst;
|
|
pu1_u_dst = pu1_v_dst;
|
|
pu1_v_dst = temp;
|
|
|
|
pu1_u_src = (UWORD8 *)pu1_uv_src + 1;
|
|
pu1_v_src = (UWORD8 *)pu1_uv_src;
|
|
}
|
|
|
|
{
|
|
__m128i src_uv0_8x16b, src_uv1_8x16b, src_u_8x16b, src_v_8x16b;
|
|
__m128i temp0_8x16b, temp1_8x16b, alt_first_mask;
|
|
|
|
UWORD8 FIRST_ALT_SHUFFLE[16] = {
|
|
0x00, 0x02, 0x04, 0x06,
|
|
0x08, 0x0A, 0x0C, 0x0E,
|
|
0x01, 0x03, 0x05, 0x07,
|
|
0x09, 0x0B, 0x0D, 0x0F };
|
|
|
|
PREFETCH((char const *)(pu1_uv_src + (0 * src_uv_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (1 * src_uv_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (2 * src_uv_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (3 * src_uv_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (4 * src_uv_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (5 * src_uv_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (6 * src_uv_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (7 * src_uv_strd)), _MM_HINT_T0)
|
|
|
|
num_rows = ht >> 1;
|
|
num_cols = wd >> 1;
|
|
|
|
src_strd = src_uv_strd;
|
|
dst_strd = dst_uv_strd;
|
|
|
|
alt_first_mask = _mm_loadu_si128((__m128i *)&FIRST_ALT_SHUFFLE[0]);
|
|
|
|
if(num_cols > 15)
|
|
{
|
|
cols = num_cols >> 4;
|
|
|
|
for(i = 0; i < (num_rows >> 2); i++)
|
|
{
|
|
UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
|
|
|
|
PREFETCH((char const *)(pu1_uv_src + (8 * src_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (9 * src_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (10 * src_strd)), _MM_HINT_T0)
|
|
PREFETCH((char const *)(pu1_uv_src + (11 * src_strd)), _MM_HINT_T0)
|
|
|
|
pu1_uv_src_temp = pu1_uv_src;
|
|
pu1_u_dst_temp = pu1_u_dst;
|
|
pu1_v_dst_temp = pu1_v_dst;
|
|
|
|
for(j = 0; j < cols; j++)
|
|
{
|
|
|
|
/**** Row 0 ***/
|
|
src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
|
|
src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
|
|
|
|
temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
|
|
temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
|
|
|
|
src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
|
|
src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
|
|
|
|
_mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
|
|
_mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
|
|
|
|
/**** Row 1 ***/
|
|
src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd)));
|
|
src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (1 * src_strd) + 16));
|
|
|
|
temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
|
|
temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
|
|
|
|
src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
|
|
src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
|
|
|
|
_mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (1 * dst_strd)), src_u_8x16b);
|
|
_mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (1 * dst_strd)), src_v_8x16b);
|
|
|
|
/**** Row 2 ***/
|
|
src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd)));
|
|
src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (2 * src_strd) + 16));
|
|
|
|
temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
|
|
temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
|
|
|
|
src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
|
|
src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
|
|
|
|
_mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (2 * dst_strd)), src_u_8x16b);
|
|
_mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (2 * dst_strd)), src_v_8x16b);
|
|
|
|
/**** Row 3 ***/
|
|
src_uv0_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd)));
|
|
src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + (3 * src_strd) + 16));
|
|
|
|
temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
|
|
temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
|
|
|
|
src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
|
|
src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
|
|
|
|
_mm_storeu_si128((__m128i *)(pu1_u_dst_temp + (3 * dst_strd)), src_u_8x16b);
|
|
_mm_storeu_si128((__m128i *)(pu1_v_dst_temp + (3 * dst_strd)), src_v_8x16b);
|
|
|
|
pu1_u_dst_temp += 16;
|
|
pu1_v_dst_temp += 16;
|
|
pu1_uv_src_temp += 32;
|
|
}
|
|
|
|
pu1_u_dst += 4 * dst_strd;
|
|
pu1_v_dst += 4 * dst_strd;
|
|
pu1_uv_src += 4 * src_strd;
|
|
//pu1_v_src += src_strd;
|
|
}
|
|
rows = num_rows & 0x3;
|
|
if(rows)
|
|
{
|
|
for(i = 0; i < rows; i++)
|
|
{
|
|
UWORD8 *pu1_uv_src_temp, *pu1_u_dst_temp, *pu1_v_dst_temp;
|
|
|
|
pu1_uv_src_temp = pu1_uv_src;
|
|
pu1_u_dst_temp = pu1_u_dst;
|
|
pu1_v_dst_temp = pu1_v_dst;
|
|
|
|
for(j = 0; j < cols; j++)
|
|
{
|
|
|
|
src_uv0_8x16b = _mm_loadu_si128((__m128i *)pu1_uv_src_temp);
|
|
src_uv1_8x16b = _mm_loadu_si128((__m128i *)(pu1_uv_src_temp + 16));
|
|
|
|
temp0_8x16b = _mm_shuffle_epi8(src_uv0_8x16b, alt_first_mask);
|
|
temp1_8x16b = _mm_shuffle_epi8(src_uv1_8x16b, alt_first_mask);
|
|
|
|
src_u_8x16b = _mm_unpacklo_epi64(temp0_8x16b, temp1_8x16b);
|
|
src_v_8x16b = _mm_unpackhi_epi64(temp0_8x16b, temp1_8x16b);
|
|
|
|
_mm_storeu_si128((__m128i *)(pu1_u_dst_temp), src_u_8x16b);
|
|
_mm_storeu_si128((__m128i *)(pu1_v_dst_temp), src_v_8x16b);
|
|
|
|
pu1_u_dst_temp += 16;
|
|
pu1_v_dst_temp += 16;
|
|
pu1_uv_src_temp += 32;
|
|
}
|
|
|
|
pu1_u_dst += dst_strd;
|
|
pu1_v_dst += dst_strd;
|
|
pu1_uv_src += src_strd;
|
|
}
|
|
}
|
|
pu1_u_dst -= (num_rows * dst_strd);
|
|
pu1_v_dst -= (num_rows * dst_strd);
|
|
num_cols &= 0x0F;
|
|
}
|
|
if(num_cols)
|
|
{
|
|
pu1_u_dst += (cols << 4);
|
|
pu1_v_dst += (cols << 4);
|
|
pu1_u_src += 2 * (cols << 4);
|
|
pu1_v_src += 2 * (cols << 4);
|
|
for(i = 0; i < num_rows; i++)
|
|
{
|
|
for(j = 0; j < num_cols; j++)
|
|
{
|
|
pu1_u_dst[j] = pu1_u_src[j * 2];
|
|
pu1_v_dst[j] = pu1_v_src[j * 2];
|
|
}
|
|
|
|
pu1_u_dst += dst_strd;
|
|
pu1_v_dst += dst_strd;
|
|
pu1_u_src += src_strd;
|
|
pu1_v_src += src_strd;
|
|
}
|
|
}
|
|
}
|
|
return;
|
|
}
|