900 lines
38 KiB
C
900 lines
38 KiB
C
/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
|
|
|
|
/**
*******************************************************************************
* @file
*  impeg2_inter_pred_sse42_intr.c
*
* @brief
*  Contains Motion compensation function definitions for MPEG2 decoder
*
* @author
*  Mohit [100664]
*
* @par List of Functions:
*  - impeg2_copy_mb_sse42()
*  - impeg2_interpolate_sse42()
*  - impeg2_mc_halfx_halfy_8x8_sse42()
*  - impeg2_mc_halfx_fully_8x8_sse42()
*  - impeg2_mc_fullx_halfy_8x8_sse42()
*  - impeg2_mc_fullx_fully_8x8_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include "iv_datatypedef.h"
|
|
#include "impeg2_macros.h"
|
|
#include "impeg2_defs.h"
|
|
#include "impeg2_inter_pred.h"
|
|
|
|
#include <immintrin.h>
|
|
#include <emmintrin.h>
|
|
#include <smmintrin.h>
|
|
#include <tmmintrin.h>
|
|
|
|
/*******************************************************************************
|
|
* Function Name : impeg2_copy_mb
|
|
*
|
|
* Description : copies 3 components to the frame from mc_buf
|
|
*
|
|
* Arguments :
|
|
* src_buf : Source Buffer
|
|
* dst_buf : Destination Buffer
|
|
* src_wd : Source Width
|
|
* dst_wd : destination Width
|
|
*
|
|
* Values Returned : None
|
|
*******************************************************************************/
|
|
void impeg2_copy_mb_sse42(yuv_buf_t *src_buf,
|
|
yuv_buf_t *dst_buf,
|
|
UWORD32 src_wd,
|
|
UWORD32 dst_wd)
|
|
{
|
|
UWORD8 *src;
|
|
UWORD8 *dst;
|
|
__m128i src_r0, src_r1, src_r2, src_r3;
|
|
|
|
/*******************************************************/
|
|
/* copy Y */
|
|
/*******************************************************/
|
|
src = src_buf->pu1_y;
|
|
dst = dst_buf->pu1_y;
|
|
// Row 0-3
|
|
src_r0 = _mm_loadu_si128((__m128i *) (src));
|
|
src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
|
|
src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
|
|
src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
|
|
|
|
// Row 4-7
|
|
src += 4 * src_wd;
|
|
dst += 4 * dst_wd;
|
|
src_r0 = _mm_loadu_si128((__m128i *) (src));
|
|
src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
|
|
src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
|
|
src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
|
|
|
|
// Row 8-11
|
|
src += 4 * src_wd;
|
|
dst += 4 * dst_wd;
|
|
src_r0 = _mm_loadu_si128((__m128i *) (src));
|
|
src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
|
|
src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
|
|
src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
|
|
|
|
// Row 12-15
|
|
src += 4 * src_wd;
|
|
dst += 4 * dst_wd;
|
|
src_r0 = _mm_loadu_si128((__m128i *) (src));
|
|
src_r1 = _mm_loadu_si128((__m128i *) (src + src_wd));
|
|
src_r2 = _mm_loadu_si128((__m128i *) (src + 2 * src_wd));
|
|
src_r3 = _mm_loadu_si128((__m128i *) (src + 3 * src_wd));
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + dst_wd), src_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * dst_wd), src_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * dst_wd), src_r3);
|
|
|
|
src_wd >>= 1;
|
|
dst_wd >>= 1;
|
|
|
|
/*******************************************************/
|
|
/* copy U */
|
|
/*******************************************************/
|
|
src = src_buf->pu1_u;
|
|
dst = dst_buf->pu1_u;
|
|
|
|
// Row 0-3
|
|
src_r0 = _mm_loadl_epi64((__m128i *)src);
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd));
|
|
src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
|
|
src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
|
|
|
|
_mm_storel_epi64((__m128i *)dst, src_r0);
|
|
_mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
|
|
_mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
|
|
_mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
|
|
|
|
// Row 4-7
|
|
src += 4 * src_wd;
|
|
dst += 4 * dst_wd;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *)src);
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd));
|
|
src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
|
|
src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
|
|
|
|
_mm_storel_epi64((__m128i *)dst, src_r0);
|
|
_mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
|
|
_mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
|
|
_mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
|
|
|
|
/*******************************************************/
|
|
/* copy V */
|
|
/*******************************************************/
|
|
src = src_buf->pu1_v;
|
|
dst = dst_buf->pu1_v;
|
|
// Row 0-3
|
|
src_r0 = _mm_loadl_epi64((__m128i *)src);
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd));
|
|
src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
|
|
src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
|
|
|
|
_mm_storel_epi64((__m128i *)dst, src_r0);
|
|
_mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
|
|
_mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
|
|
_mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
|
|
|
|
// Row 4-7
|
|
src += 4 * src_wd;
|
|
dst += 4 * dst_wd;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *)src);
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(src + src_wd));
|
|
src_r2 = _mm_loadl_epi64((__m128i *)(src + 2 * src_wd));
|
|
src_r3 = _mm_loadl_epi64((__m128i *)(src + 3 * src_wd));
|
|
|
|
_mm_storel_epi64((__m128i *)dst, src_r0);
|
|
_mm_storel_epi64((__m128i *)(dst + dst_wd), src_r1);
|
|
_mm_storel_epi64((__m128i *)(dst + 2 * dst_wd), src_r2);
|
|
_mm_storel_epi64((__m128i *)(dst + 3 * dst_wd), src_r3);
|
|
}
|
|
|
|
/*****************************************************************************/
|
|
/* */
|
|
/* Function Name : impeg2_interpolate */
|
|
/* */
|
|
/* Description : averages the contents of buf_src1 and buf_src2 and stores*/
|
|
/* result in buf_dst */
|
|
/* */
|
|
/* Inputs : buf_src1 - First Source */
|
|
/* buf_src2 - Second Source */
|
|
/* */
|
|
/* Globals : None */
|
|
/* */
|
|
/* Processing : Avg the values from two sources and store the result in */
|
|
/* destination buffer */
|
|
/* */
|
|
/* Outputs : buf_dst - Avg of contents of buf_src1 and buf_src2 */
|
|
/* */
|
|
/* Returns : None */
|
|
/* */
|
|
/* Issues : Assumes that all 3 buffers are of same size */
|
|
/* */
|
|
/*****************************************************************************/
|
|
void impeg2_interpolate_sse42(yuv_buf_t *buf_src1,
|
|
yuv_buf_t *buf_src2,
|
|
yuv_buf_t *buf_dst,
|
|
UWORD32 stride)
|
|
{
|
|
UWORD8 *src1, *src2;
|
|
UWORD8 *dst;
|
|
__m128i src1_r0, src1_r1, src1_r2, src1_r3;
|
|
__m128i src2_r0, src2_r1, src2_r2, src2_r3;
|
|
|
|
/*******************************************************/
|
|
/* interpolate Y */
|
|
/*******************************************************/
|
|
src1 = buf_src1->pu1_y;
|
|
src2 = buf_src2->pu1_y;
|
|
dst = buf_dst->pu1_y;
|
|
// Row 0-3
|
|
src1_r0 = _mm_loadu_si128((__m128i *) (src1));
|
|
src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
|
|
src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
|
|
src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
|
|
|
|
src2_r0 = _mm_loadu_si128((__m128i *) (src2));
|
|
src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
|
|
src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
|
|
src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src1_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
|
|
|
|
// Row 4-7
|
|
src1 += 4 * 16;
|
|
src2 += 4 * 16;
|
|
dst += 4 * stride;
|
|
src1_r0 = _mm_loadu_si128((__m128i *) (src1));
|
|
src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
|
|
src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
|
|
src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
|
|
|
|
src2_r0 = _mm_loadu_si128((__m128i *) (src2));
|
|
src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
|
|
src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
|
|
src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src1_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
|
|
|
|
// Row 8-11
|
|
src1 += 4 * 16;
|
|
src2 += 4 * 16;
|
|
dst += 4 * stride;
|
|
src1_r0 = _mm_loadu_si128((__m128i *) (src1));
|
|
src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
|
|
src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
|
|
src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
|
|
|
|
src2_r0 = _mm_loadu_si128((__m128i *) (src2));
|
|
src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
|
|
src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
|
|
src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src1_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
|
|
|
|
// Row 12-15
|
|
src1 += 4 * 16;
|
|
src2 += 4 * 16;
|
|
dst += 4 * stride;
|
|
src1_r0 = _mm_loadu_si128((__m128i *) (src1));
|
|
src1_r1 = _mm_loadu_si128((__m128i *) (src1 + 16));
|
|
src1_r2 = _mm_loadu_si128((__m128i *) (src1 + 2 * 16));
|
|
src1_r3 = _mm_loadu_si128((__m128i *) (src1 + 3 * 16));
|
|
|
|
src2_r0 = _mm_loadu_si128((__m128i *) (src2));
|
|
src2_r1 = _mm_loadu_si128((__m128i *) (src2 + 16));
|
|
src2_r2 = _mm_loadu_si128((__m128i *) (src2 + 2 * 16));
|
|
src2_r3 = _mm_loadu_si128((__m128i *) (src2 + 3 * 16));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storeu_si128((__m128i *) dst, src1_r0);
|
|
_mm_storeu_si128((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storeu_si128((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storeu_si128((__m128i *) (dst + 3 * stride), src1_r3);
|
|
|
|
stride >>= 1;
|
|
|
|
/*******************************************************/
|
|
/* interpolate U */
|
|
/*******************************************************/
|
|
src1 = buf_src1->pu1_u;
|
|
src2 = buf_src2->pu1_u;
|
|
dst = buf_dst->pu1_u;
|
|
// Row 0-3
|
|
src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
|
|
src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
|
|
src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
|
|
src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
|
|
|
|
src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
|
|
src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
|
|
src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
|
|
src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storel_epi64((__m128i *) dst, src1_r0);
|
|
_mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
|
|
|
|
// Row 4-7
|
|
src1 += 4 * 8;
|
|
src2 += 4 * 8;
|
|
dst += 4 * stride;
|
|
|
|
src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
|
|
src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
|
|
src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
|
|
src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
|
|
|
|
src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
|
|
src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
|
|
src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
|
|
src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storel_epi64((__m128i *) dst, src1_r0);
|
|
_mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
|
|
|
|
/*******************************************************/
|
|
/* interpolate V */
|
|
/*******************************************************/
|
|
src1 = buf_src1->pu1_v;
|
|
src2 = buf_src2->pu1_v;
|
|
dst = buf_dst->pu1_v;
|
|
|
|
// Row 0-3
|
|
src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
|
|
src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
|
|
src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
|
|
src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
|
|
|
|
src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
|
|
src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
|
|
src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
|
|
src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storel_epi64((__m128i *) dst, src1_r0);
|
|
_mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
|
|
|
|
// Row 4-7
|
|
src1 += 4 * 8;
|
|
src2 += 4 * 8;
|
|
dst += 4 * stride;
|
|
|
|
src1_r0 = _mm_loadl_epi64((__m128i *) (src1));
|
|
src1_r1 = _mm_loadl_epi64((__m128i *) (src1 + 8));
|
|
src1_r2 = _mm_loadl_epi64((__m128i *) (src1 + 2 * 8));
|
|
src1_r3 = _mm_loadl_epi64((__m128i *) (src1 + 3 * 8));
|
|
|
|
src2_r0 = _mm_loadl_epi64((__m128i *) (src2));
|
|
src2_r1 = _mm_loadl_epi64((__m128i *) (src2 + 8));
|
|
src2_r2 = _mm_loadl_epi64((__m128i *) (src2 + 2 * 8));
|
|
src2_r3 = _mm_loadl_epi64((__m128i *) (src2 + 3 * 8));
|
|
|
|
src1_r0 = _mm_avg_epu8 (src1_r0, src2_r0);
|
|
src1_r1 = _mm_avg_epu8 (src1_r1, src2_r1);
|
|
src1_r2 = _mm_avg_epu8 (src1_r2, src2_r2);
|
|
src1_r3 = _mm_avg_epu8 (src1_r3, src2_r3);
|
|
|
|
_mm_storel_epi64((__m128i *) dst, src1_r0);
|
|
_mm_storel_epi64((__m128i *) (dst + stride), src1_r1);
|
|
_mm_storel_epi64((__m128i *) (dst + 2 * stride), src1_r2);
|
|
_mm_storel_epi64((__m128i *) (dst + 3 * stride), src1_r3);
|
|
}
|
|
|
|
/*****************************************************************************/
|
|
/* */
|
|
/* Function Name : impeg2_mc_halfx_halfy_8x8_sse42() */
|
|
/* */
|
|
/* Description : Gets the buffer from (0.5,0.5) to (8.5,8.5) */
|
|
/* and the above block of size 8 x 8 will be placed as a */
|
|
/* block from the current position of out_buf */
|
|
/* */
|
|
/* Inputs : ref - Reference frame from which the block will be */
|
|
/* block will be extracted. */
|
|
/* ref_wid - WIdth of reference frame */
|
|
/* out_wid - WIdth of the output frame */
|
|
/* blk_width - width of the block */
|
|
/* blk_width - height of the block */
|
|
/* */
|
|
/* Globals : None */
|
|
/* */
|
|
/* Processing : Point to the (0,0),(1,0),(0,1),(1,1) position in */
|
|
/* the ref frame.Interpolate these four values to get the */
|
|
/* value at(0.5,0.5).Repeat this to get an 8 x 8 block */
|
|
/* using 9 x 9 block from reference frame */
|
|
/* */
|
|
/* Outputs : out - Output containing the extracted block */
|
|
/* */
|
|
/* Returns : None */
|
|
/* */
|
|
/* Issues : None */
|
|
/* */
|
|
/*****************************************************************************/
|
|
void impeg2_mc_halfx_halfy_8x8_sse42(UWORD8 *out,
|
|
UWORD8 *ref,
|
|
UWORD32 ref_wid,
|
|
UWORD32 out_wid)
|
|
{
|
|
UWORD8 *ref_p0,*ref_p1,*ref_p2,*ref_p3;
|
|
/* P0-P3 are the pixels in the reference frame and Q is the value being */
|
|
/* estimated */
|
|
/*
|
|
P0 P1
|
|
Q
|
|
P2 P3
|
|
*/
|
|
__m128i src_r0, src_r0_1, src_r1, src_r1_1;
|
|
__m128i tmp0, tmp1;
|
|
__m128i value_2 = _mm_set1_epi16(2);
|
|
|
|
ref_p0 = ref;
|
|
ref_p1 = ref + 1;
|
|
ref_p2 = ref + ref_wid;
|
|
ref_p3 = ref + ref_wid + 1;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
|
|
src_r1 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 1
|
|
src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
src_r1 = _mm_cvtepu8_epi16(src_r1);
|
|
src_r1_1 = _mm_cvtepu8_epi16(src_r1_1);
|
|
|
|
tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 0 horizontal interpolation
|
|
tmp1 = _mm_add_epi16(src_r1, src_r1_1); //Row 1 horizontal interpolation
|
|
tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 0 vertical interpolation
|
|
tmp0 = _mm_add_epi16(tmp0, value_2);
|
|
tmp0 = _mm_srli_epi16(tmp0, 2);
|
|
tmp0 = _mm_packus_epi16(tmp0, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp0);
|
|
|
|
//Row 1
|
|
ref_p2 += ref_wid;
|
|
ref_p3 += ref_wid;
|
|
out += out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 2
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
|
|
tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 2 horizontal interpolation
|
|
tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 1 vertical interpolation
|
|
tmp1 = _mm_add_epi16(tmp1, value_2);
|
|
tmp1 = _mm_srli_epi16(tmp1, 2);
|
|
tmp1 = _mm_packus_epi16(tmp1, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp1);
|
|
|
|
//Row 2
|
|
ref_p2 += ref_wid;
|
|
ref_p3 += ref_wid;
|
|
out += out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 3
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
|
|
tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 3 horizontal interpolation
|
|
|
|
tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 2 vertical interpolation
|
|
tmp0 = _mm_add_epi16(tmp0, value_2);
|
|
tmp0 = _mm_srli_epi16(tmp0, 2);
|
|
tmp0 = _mm_packus_epi16(tmp0, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp0);
|
|
|
|
//Row 3
|
|
ref_p2 += ref_wid;
|
|
ref_p3 += ref_wid;
|
|
out += out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 4
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
|
|
tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 4 horizontal interpolation
|
|
|
|
tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 3 vertical interpolation
|
|
tmp1 = _mm_add_epi16(tmp1, value_2);
|
|
tmp1 = _mm_srli_epi16(tmp1, 2);
|
|
tmp1 = _mm_packus_epi16(tmp1, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp1);
|
|
|
|
//Row 4
|
|
ref_p2 += ref_wid;
|
|
ref_p3 += ref_wid;
|
|
out += out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 5
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
|
|
tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 5 horizontal interpolation
|
|
|
|
tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 4 vertical interpolation
|
|
tmp0 = _mm_add_epi16(tmp0, value_2);
|
|
tmp0 = _mm_srli_epi16(tmp0, 2);
|
|
tmp0 = _mm_packus_epi16(tmp0, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp0);
|
|
|
|
//Row 5
|
|
ref_p2 += ref_wid;
|
|
ref_p3 += ref_wid;
|
|
out += out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 6
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
|
|
tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 6 horizontal interpolation
|
|
|
|
tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 5 vertical interpolation
|
|
tmp1 = _mm_add_epi16(tmp1, value_2);
|
|
tmp1 = _mm_srli_epi16(tmp1, 2);
|
|
tmp1 = _mm_packus_epi16(tmp1, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp1);
|
|
|
|
//Row 6
|
|
ref_p2 += ref_wid;
|
|
ref_p3 += ref_wid;
|
|
out += out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 7
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
|
|
tmp1 = _mm_add_epi16(src_r0, src_r0_1); //Row 7 horizontal interpolation
|
|
|
|
tmp0 = _mm_add_epi16(tmp0, tmp1); //Row 6 vertical interpolation
|
|
tmp0 = _mm_add_epi16(tmp0, value_2);
|
|
tmp0 = _mm_srli_epi16(tmp0, 2);
|
|
tmp0 = _mm_packus_epi16(tmp0, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp0);
|
|
|
|
//Row 7
|
|
ref_p2 += ref_wid;
|
|
ref_p3 += ref_wid;
|
|
out += out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p2)); //Row 8
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p3));
|
|
|
|
src_r0 = _mm_cvtepu8_epi16(src_r0);
|
|
src_r0_1 = _mm_cvtepu8_epi16(src_r0_1);
|
|
|
|
tmp0 = _mm_add_epi16(src_r0, src_r0_1); //Row 8 horizontal interpolation
|
|
|
|
tmp1 = _mm_add_epi16(tmp0, tmp1); //Row 7 vertical interpolation
|
|
tmp1 = _mm_add_epi16(tmp1, value_2);
|
|
tmp1 = _mm_srli_epi16(tmp1, 2);
|
|
tmp1 = _mm_packus_epi16(tmp1, value_2);
|
|
|
|
_mm_storel_epi64((__m128i *)out, tmp1);
|
|
|
|
return;
|
|
}
|
|
|
|
/*****************************************************************************/
|
|
/* */
|
|
/* Function Name : impeg2_mc_halfx_fully_8x8_sse42() */
|
|
/* */
|
|
/* Description : Gets the buffer from (0.5,0) to (8.5,8) */
|
|
/* and the above block of size 8 x 8 will be placed as a */
|
|
/* block from the current position of out_buf */
|
|
/* */
|
|
/* Inputs : ref - Reference frame from which the block will be */
|
|
/* block will be extracted. */
|
|
/* ref_wid - WIdth of reference frame */
|
|
/* out_wid - WIdth of the output frame */
|
|
/* blk_width - width of the block */
|
|
/* blk_width - height of the block */
|
|
/* */
|
|
/* Globals : None */
|
|
/* */
|
|
/* Processing : Point to the (0,0) and (1,0) position in the ref frame */
|
|
/* Interpolate these two values to get the value at(0.5,0) */
|
|
/* Repeat this to get an 8 x 8 block using 9 x 8 block from */
|
|
/* reference frame */
|
|
/* */
|
|
/* Outputs : out - Output containing the extracted block */
|
|
/* */
|
|
/* Returns : None */
|
|
/* */
|
|
/* Issues : None */
|
|
/* */
|
|
/*****************************************************************************/
|
|
void impeg2_mc_halfx_fully_8x8_sse42(UWORD8 *out,
|
|
UWORD8 *ref,
|
|
UWORD32 ref_wid,
|
|
UWORD32 out_wid)
|
|
{
|
|
UWORD8 *ref_p0,*ref_p1;
|
|
__m128i src_r0, src_r0_1, src_r1, src_r1_1;
|
|
/* P0-P3 are the pixels in the reference frame and Q is the value being */
|
|
/* estimated */
|
|
/*
|
|
P0 Q P1
|
|
*/
|
|
|
|
ref_p0 = ref;
|
|
ref_p1 = ref + 1;
|
|
|
|
// Row 0 and 1
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 0
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
|
|
src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 1
|
|
src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
|
|
|
|
src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
|
|
src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
|
|
|
|
_mm_storel_epi64((__m128i *)out, src_r0);
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
|
|
|
|
// Row 2 and 3
|
|
ref_p0 += 2*ref_wid;
|
|
ref_p1 += 2*ref_wid;
|
|
out += 2*out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 2
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
|
|
src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 3
|
|
src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
|
|
|
|
src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
|
|
src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
|
|
|
|
_mm_storel_epi64((__m128i *)out, src_r0);
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
|
|
|
|
// Row 4 and 5
|
|
ref_p0 += 2*ref_wid;
|
|
ref_p1 += 2*ref_wid;
|
|
out += 2*out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 4
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
|
|
src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 5
|
|
src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
|
|
|
|
src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
|
|
src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
|
|
|
|
_mm_storel_epi64((__m128i *)out, src_r0);
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
|
|
|
|
// Row 6 and 7
|
|
ref_p0 += 2*ref_wid;
|
|
ref_p1 += 2*ref_wid;
|
|
out += 2*out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *) (ref_p0)); //Row 6
|
|
src_r0_1 = _mm_loadl_epi64((__m128i *) (ref_p1));
|
|
src_r1 = _mm_loadl_epi64((__m128i *) (ref_p0 + ref_wid)); //Row 7
|
|
src_r1_1 = _mm_loadl_epi64((__m128i *) (ref_p1 + ref_wid));
|
|
|
|
src_r0 = _mm_avg_epu8(src_r0, src_r0_1);
|
|
src_r1 = _mm_avg_epu8(src_r1, src_r1_1);
|
|
|
|
_mm_storel_epi64((__m128i *)out, src_r0);
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
|
|
|
|
return;
|
|
}
|
|
|
|
|
|
/*****************************************************************************/
|
|
/* */
|
|
/* Function Name : impeg2_mc_fullx_halfy_8x8_sse42() */
|
|
/* */
|
|
/* Description : Gets the buffer from (0,0.5) to (8,8.5) */
|
|
/* and the above block of size 8 x 8 will be placed as a */
|
|
/* block from the current position of out_buf */
|
|
/* */
|
|
/* Inputs : ref - Reference frame from which the block will be */
|
|
/* block will be extracted. */
|
|
/* ref_wid - WIdth of reference frame */
|
|
/* out_wid - WIdth of the output frame */
|
|
/* blk_width - width of the block */
|
|
/* blk_width - height of the block */
|
|
/* */
|
|
/* Globals : None */
|
|
/* */
|
|
/* Processing : Point to the (0,0) and (0,1) position in the ref frame */
|
|
/* Interpolate these two values to get the value at(0,0.5) */
|
|
/* Repeat this to get an 8 x 8 block using 8 x 9 block from */
|
|
/* reference frame */
|
|
/* */
|
|
/* Outputs : out - Output containing the extracted block */
|
|
/* */
|
|
/* Returns : None */
|
|
/* */
|
|
/* Issues : None */
|
|
/* */
|
|
/*****************************************************************************/
|
|
void impeg2_mc_fullx_halfy_8x8_sse42(UWORD8 *out,
|
|
UWORD8 *ref,
|
|
UWORD32 ref_wid,
|
|
UWORD32 out_wid)
|
|
{
|
|
__m128i src_r0, src_r1, src_r2, temp0, temp1;
|
|
/* P0-P3 are the pixels in the reference frame and Q is the value being */
|
|
/* estimated */
|
|
/*
|
|
P0
|
|
x
|
|
P1
|
|
*/
|
|
src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 0
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 1
|
|
src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid)); //Row 2
|
|
temp0 = _mm_avg_epu8(src_r0, src_r1);
|
|
temp1 = _mm_avg_epu8(src_r1, src_r2);
|
|
_mm_storel_epi64((__m128i *)out, temp0); //Row 0
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 1
|
|
|
|
ref+= 3*ref_wid;
|
|
out+= 2*out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *)ref); //Row 3
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 4
|
|
temp0 = _mm_avg_epu8(src_r2, src_r0);
|
|
temp1 = _mm_avg_epu8(src_r0, src_r1);
|
|
_mm_storel_epi64((__m128i *)out, temp0); //Row 2
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 3
|
|
|
|
ref += 2*ref_wid;
|
|
out+= 2*out_wid;
|
|
|
|
src_r2 = _mm_loadl_epi64((__m128i *)ref); //Row 5
|
|
src_r0 = _mm_loadl_epi64((__m128i *)(ref + ref_wid)); //Row 6
|
|
temp0 = _mm_avg_epu8(src_r1, src_r2);
|
|
temp1 = _mm_avg_epu8(src_r2, src_r0);
|
|
_mm_storel_epi64((__m128i *)out, temp0); //Row 4
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 5
|
|
|
|
ref += 2*ref_wid;
|
|
out+= 2*out_wid;
|
|
|
|
src_r1 = _mm_loadl_epi64((__m128i *)ref); //Row 7
|
|
src_r2 = _mm_loadl_epi64((__m128i *) (ref + ref_wid)); //Row 8
|
|
temp0 = _mm_avg_epu8(src_r0, src_r1);
|
|
temp1 = _mm_avg_epu8(src_r1, src_r2);
|
|
_mm_storel_epi64((__m128i *)out, temp0); //Row 6
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), temp1); //Row 7
|
|
|
|
return;
|
|
}
|
|
|
|
/*****************************************************************************/
|
|
/* */
|
|
/* Function Name : impeg2_mc_fullx_fully_8x8_sse42() */
|
|
/* */
|
|
/* Description : Gets the buffer from (x,y) to (x+8,y+8) */
|
|
/* and the above block of size 8 x 8 will be placed as a */
|
|
/* block from the current position of out_buf */
|
|
/* */
|
|
/* Inputs : ref - Reference frame from which the block will be */
|
|
/* block will be extracted. */
|
|
/* ref_wid - WIdth of reference frame */
|
|
/* out_wid - WIdth of the output frame */
|
|
/* blk_width - width of the block */
|
|
/* blk_width - height of the block */
|
|
/* */
|
|
/* Globals : None */
|
|
/* */
|
|
/* Processing : Point to the (0,0) position in the ref frame */
|
|
/* Get an 8 x 8 block from reference frame */
|
|
/* */
|
|
/* Outputs : out - Output containing the extracted block */
|
|
/* */
|
|
/* Returns : None */
|
|
/* */
|
|
/* Issues : None */
|
|
/* */
|
|
/*****************************************************************************/
|
|
void impeg2_mc_fullx_fully_8x8_sse42(UWORD8 *out,
|
|
UWORD8 *ref,
|
|
UWORD32 ref_wid,
|
|
UWORD32 out_wid)
|
|
{
|
|
__m128i src_r0, src_r1, src_r2, src_r3;
|
|
// Row 0-3
|
|
src_r0 = _mm_loadl_epi64((__m128i *)ref);
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
|
|
src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));
|
|
src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid));
|
|
|
|
_mm_storel_epi64((__m128i *)out, src_r0);
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
|
|
_mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2);
|
|
_mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3);
|
|
|
|
// Row 4-7
|
|
ref += 4 * ref_wid;
|
|
out += 4 * out_wid;
|
|
|
|
src_r0 = _mm_loadl_epi64((__m128i *)ref);
|
|
src_r1 = _mm_loadl_epi64((__m128i *)(ref + ref_wid));
|
|
src_r2 = _mm_loadl_epi64((__m128i *)(ref + 2 * ref_wid));
|
|
src_r3 = _mm_loadl_epi64((__m128i *)(ref + 3 * ref_wid));
|
|
|
|
_mm_storel_epi64((__m128i *)out, src_r0);
|
|
_mm_storel_epi64((__m128i *)(out + out_wid), src_r1);
|
|
_mm_storel_epi64((__m128i *)(out + 2 * out_wid), src_r2);
|
|
_mm_storel_epi64((__m128i *)(out + 3 * out_wid), src_r3);
|
|
return;
|
|
}
|