// Listing metadata: ArmAsm source, 815 lines, 25 KiB
//******************************************************************************
|
|
//*
|
|
//* Copyright (C) 2015 The Android Open Source Project
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************
|
|
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
//*/
|
|
|
|
///*
|
|
////----------------------------------------------------------------------------
|
|
//// File Name : impeg2_inter_pred.s
|
|
////
|
|
//// Description : This file has motion compensation related
|
|
//// interpolation functions for Neon on the ARMv8 (AArch64) platform
|
|
////
|
|
//// Reference Document :
|
|
////
|
|
//// Revision History :
|
|
//// Date Author Detail Description
|
|
//// ------------ ---------------- ----------------------------------
|
|
//// 18 jun 2010 S Hamsalekha Created
|
|
////
|
|
////-------------------------------------------------------------------------
|
|
//*/
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Include Files
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
// PRESERVE8
|
|
.text
|
|
.include "impeg2_neon_macros.s"
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Struct/Union Types and Define
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Static Global Data section variables
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
//// -------------------------- NONE --------------------------------------------
|
|
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Static Prototype Functions
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
//// -------------------------- NONE --------------------------------------------
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Exported functions
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
|
|
|
|
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_copy_mb_av8()
////
//// Detail Description : Copies one macroblock from src to dst:
////                      16 rows x 16 bytes from the plane pointer at offset 0
////                      (y), then 8 rows x 8 bytes from each of the plane
////                      pointers at offsets 8 (u) and 16 (v).
////
//// Inputs             : x0 - pointer to src (three plane pointers: y, u, v)
////                      x1 - pointer to dst (same layout as src)
////                      x2 - source width      (row step for y; halved for u/v)
////                      x3 - destination width (row step for y; halved for u/v)
////
//// Registers Used     : x2-x5 (x2/x3 are halved in place), v0, v1
////
//// Stack Usage        : whatever push_v_regs reserves (macro defined in
////                      impeg2_neon_macros.s, not visible in this file)
////
//// Outputs            : dst planes hold the copied macroblock
////
//// Return Data        : None
////
//// Programming Note   : NOTE(review): x2/x3 are consumed as full 64-bit
////                      post-index offsets. If the C prototype passes them as
////                      32-bit integers, AAPCS64 leaves bits 63:32 unspecified
////                      - confirm against the caller.
////                      NOTE(review): only v0/v1 are written here, and those
////                      are caller-saved under AAPCS64, so the save/restore
////                      pair looks unnecessary - confirm before removing.
////---------------------------------------------------------------------------
//*/

    .global impeg2_copy_mb_av8

impeg2_copy_mb_av8:
    push_v_regs

    // ---- y plane: 16 rows of 16 bytes --------------------------------------
    ldr     x4, [x0]                        // x4 = src->y
    ldr     x5, [x1]                        // x5 = dst->y
    .rept   16                              // expands to 16 straight-line load/store pairs
    ld1     {v0.8b, v1.8b}, [x4], x2        // fetch one row, step src by x2
    st1     {v0.8b, v1.8b}, [x5], x3        // write one row, step dst by x3
    .endr

    // Chroma planes use half the luma row step.
    lsr     x2, x2, #1                      // src step /= 2
    lsr     x3, x3, #1                      // dst step /= 2

    // ---- u plane: 8 rows of 8 bytes ----------------------------------------
    ldr     x4, [x0, #8]                    // x4 = src->u
    ldr     x5, [x1, #8]                    // x5 = dst->u
    .rept   8
    ld1     {v0.8b}, [x4], x2
    st1     {v0.8b}, [x5], x3
    .endr

    // ---- v plane: 8 rows of 8 bytes ----------------------------------------
    ldr     x4, [x0, #16]                   // x4 = src->v
    ldr     x5, [x1, #16]                   // x5 = dst->v
    .rept   8
    ld1     {v0.8b}, [x4], x2
    st1     {v0.8b}, [x5], x3
    .endr

    pop_v_regs
    ret
|
|
|
|
|
|
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_halfy_8x8_av8()
////
//// Detail Description : Vertical half-pel prediction for an 8x8 block.
////                      Reads 9 reference rows of 8 bytes and writes 8 rows,
////                      where out[r] = (ref[r] + ref[r+1] + 1) >> 1 per byte
////                      (URHADD = unsigned rounding halving add).
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row step)
////                      x3 - out_wid : Current Block Width (row step)
////
//// Registers Used     : x14, v0-v9 (x0-x3 are modified)
////
//// Stack Usage        : whatever push_v_regs reserves (macro defined in
////                      impeg2_neon_macros.s, not visible in this file)
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : v8/v9 are written; their low halves are callee-saved
////                      under AAPCS64, presumably preserved by push_v_regs /
////                      pop_v_regs - confirm in the macro file.
////                      Every vector here is produced by a 64-bit write, which
////                      zeroes bits 127:64, so the averages are written with
////                      the .8b arrangement.
////                      NOTE(review): x2/x3 are used as full 64-bit offsets;
////                      if the C prototype declares them 32-bit, the upper
////                      halves are unspecified under AAPCS64 - confirm.
////---------------------------------------------------------------------------
//*/

    .global impeg2_mc_fullx_halfy_8x8_av8

impeg2_mc_fullx_halfy_8x8_av8:
    push_v_regs

    // Two interleaved ref pointers: x1 walks rows 1,3,5,7,9 and x14 walks 2,4,6,8.
    add     x14, x1, x2                     // x14 = ref + 1 row
    lsl     x2, x2, #1                      // each pointer now steps two rows

    // Loads and averages are interleaved; the order below is the original one.
    ld1     {v0.8b}, [x1], x2               // v0 = ref row 1
    ld1     {v2.8b}, [x14], x2              // v2 = ref row 2
    ld1     {v4.8b}, [x1], x2               // v4 = ref row 3
    ld1     {v6.8b}, [x14], x2              // v6 = ref row 4
    ld1     {v1.8b}, [x1], x2               // v1 = ref row 5
    ld1     {v3.8b}, [x14], x2              // v3 = ref row 6
    urhadd  v9.8b, v1.8b, v6.8b             // out row 4 = avg(ref 5, ref 4), before v1 is reused
    ld1     {v5.8b}, [x1], x2               // v5 = ref row 7
    urhadd  v0.8b, v0.8b, v2.8b             // out row 1 = avg(ref 1, ref 2)
    urhadd  v1.8b, v1.8b, v3.8b             // out row 5 = avg(ref 5, ref 6)
    ld1     {v7.8b}, [x14], x2              // v7 = ref row 8
    urhadd  v2.8b, v2.8b, v4.8b             // out row 2 = avg(ref 2, ref 3)
    urhadd  v3.8b, v3.8b, v5.8b             // out row 6 = avg(ref 6, ref 7)
    ld1     {v8.8b}, [x1], x2               // v8 = ref row 9
    urhadd  v4.8b, v4.8b, v6.8b             // out row 3 = avg(ref 3, ref 4)
    urhadd  v5.8b, v5.8b, v7.8b             // out row 7 = avg(ref 7, ref 8)

    // Same two-pointer scheme on the output: x0 -> odd rows, x14 -> even rows.
    add     x14, x0, x3                     // x14 = out + 1 row
    lsl     x3, x3, #1                      // each pointer steps two rows

    st1     {v2.8b}, [x14], x3              // out row 2
    urhadd  v7.8b, v7.8b, v8.8b             // out row 8 = avg(ref 8, ref 9)
    st1     {v0.8b}, [x0], x3               // out row 1
    st1     {v9.8b}, [x14], x3              // out row 4
    st1     {v4.8b}, [x0], x3               // out row 3
    st1     {v3.8b}, [x14], x3              // out row 6
    st1     {v1.8b}, [x0], x3               // out row 5
    st1     {v7.8b}, [x14], x3              // out row 8
    st1     {v5.8b}, [x0], x3               // out row 7

    pop_v_regs
    ret
|
|
|
|
|
|
|
|
|
|
|
|
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_fully_8x8_av8()
////
//// Detail Description : Horizontal half-pel prediction for an 8x8 block.
////                      For each of 8 rows:
////                      out[r][c] = (ref[r][c] + ref[r][c+1] + 1) >> 1
////                      (URHADD = unsigned rounding halving add).
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row step)
////                      x3 - out_wid : Current Block Width (row step)
////
//// Registers Used     : x12, x14, v0-v10, v12-v14, v16-v18, v20-v22
////                      (x0, x1 are advanced)
////
//// Stack Usage        : whatever push_v_regs reserves (macro defined in
////                      impeg2_neon_macros.s, not visible in this file)
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Each row load fetches 16 bytes although only the
////                      first 9 are consumed (bytes 0-7 plus byte 8 via EXT).
////                      Registers in the v8-v15 range are written; their low
////                      halves are callee-saved under AAPCS64, presumably
////                      preserved by push_v_regs / pop_v_regs - confirm.
////                      All operands come from 64-bit writes (upper halves
////                      zero), so the averages are written with .8b.
////                      NOTE(review): x2/x3 are used as full 64-bit offsets;
////                      if the C prototype declares them 32-bit, the upper
////                      halves are unspecified under AAPCS64 - confirm.
////---------------------------------------------------------------------------
//*/

    .global impeg2_mc_halfx_fully_8x8_av8

impeg2_mc_halfx_fully_8x8_av8:
    push_v_regs

    // Rows 1-4 go through x1/x0, rows 5-8 through x14/x12.
    add     x14, x1, x2, lsl #2             // x14 = ref + 4 rows (ref row 5)
    add     x12, x0, x3, lsl #2             // x12 = out + 4 rows (out row 5)

    ld1     {v0.8b, v1.8b}, [x1], x2        // ref row 1: bytes 0-7 / 8-15
    ld1     {v2.8b, v3.8b}, [x14], x2       // ref row 5
    ld1     {v4.8b, v5.8b}, [x1], x2        // ref row 2
    ld1     {v6.8b, v7.8b}, [x14], x2       // ref row 6

    // EXT #1 builds the row shifted left by one pixel (bytes 1-8).
    ext     v8.8b, v0.8b, v1.8b, #1         // row 1, pixels 1-8
    ext     v12.8b, v2.8b, v3.8b, #1        // row 5, pixels 1-8
    ext     v16.8b, v4.8b, v5.8b, #1        // row 2, pixels 1-8
    ext     v20.8b, v6.8b, v7.8b, #1        // row 6, pixels 1-8

    ld1     {v9.8b, v10.8b}, [x1], x2       // ref row 3
    ld1     {v13.8b, v14.8b}, [x14], x2     // ref row 7
    ld1     {v17.8b, v18.8b}, [x1], x2      // ref row 4
    ld1     {v21.8b, v22.8b}, [x14], x2     // ref row 8

    // v1/v3/v5/v7 are free again and receive the shifted rows 3/7/4/8.
    ext     v1.8b, v9.8b, v10.8b, #1        // row 3, pixels 1-8
    ext     v3.8b, v13.8b, v14.8b, #1       // row 7, pixels 1-8
    ext     v5.8b, v17.8b, v18.8b, #1       // row 4, pixels 1-8
    ext     v7.8b, v21.8b, v22.8b, #1       // row 8, pixels 1-8

    // Rounded average of each row with its shifted copy.
    urhadd  v0.8b, v0.8b, v8.8b             // out row 1
    urhadd  v1.8b, v1.8b, v9.8b             // out row 3
    urhadd  v2.8b, v2.8b, v12.8b            // out row 5
    urhadd  v3.8b, v3.8b, v13.8b            // out row 7
    urhadd  v4.8b, v4.8b, v16.8b            // out row 2
    urhadd  v5.8b, v5.8b, v17.8b            // out row 4
    urhadd  v6.8b, v6.8b, v20.8b            // out row 6
    urhadd  v7.8b, v7.8b, v21.8b            // out row 8

    st1     {v0.8b}, [x0], x3               // out row 1
    st1     {v2.8b}, [x12], x3              // out row 5
    st1     {v4.8b}, [x0], x3               // out row 2
    st1     {v6.8b}, [x12], x3              // out row 6
    st1     {v1.8b}, [x0], x3               // out row 3
    st1     {v3.8b}, [x12], x3              // out row 7
    st1     {v5.8b}, [x0], x3               // out row 4
    st1     {v7.8b}, [x12], x3              // out row 8

    pop_v_regs
    ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_halfx_halfy_8x8_av8()
////
//// Detail Description : Diagonal (half-pel in x and y) prediction for an 8x8
////                      block. Reads 9 reference rows and writes 8 rows:
////                      out[r][c] = (ref[r][c]   + ref[r][c+1] +
////                                   ref[r+1][c] + ref[r+1][c+1] + 2) >> 2
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row step)
////                      x3 - out_wid : Current Block Width (row step)
////
//// Registers Used     : x14, v0-v18, v20, v22, v24, v26, v28, v30
////                      (x0, x1 are advanced)
////
//// Stack Usage        : whatever push_v_regs reserves (macro defined in
////                      impeg2_neon_macros.s, not visible in this file)
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Each row load fetches 16 bytes although only the
////                      first 9 are consumed (bytes 0-7 plus byte 8 via EXT).
////                      v8-v15 are written; their low halves are callee-saved
////                      under AAPCS64, presumably preserved by push_v_regs /
////                      pop_v_regs - confirm in the macro file.
////                      NOTE(review): x2/x3 are used as full 64-bit offsets;
////                      if the C prototype declares them 32-bit, the upper
////                      halves are unspecified under AAPCS64 - confirm.
////---------------------------------------------------------------------------
//*/

    .global impeg2_mc_halfx_halfy_8x8_av8

impeg2_mc_halfx_halfy_8x8_av8:
    push_v_regs

    // x1 reads ref rows 1-4; x14 reads ref rows 5-9.
    add     x14, x1, x2, lsl #2             // x14 = ref + 4 rows (ref row 5)

    ld1     {v0.8b, v1.8b}, [x1], x2        // ref row 1: bytes 0-7 / 8-15
    ld1     {v2.8b, v3.8b}, [x14], x2       // ref row 5
    ld1     {v4.8b, v5.8b}, [x1], x2        // ref row 2
    ld1     {v6.8b, v7.8b}, [x14], x2       // ref row 6

    // EXT #1 replaces the high half of each pair with the row shifted by one pixel.
    ext     v1.8b, v0.8b, v1.8b, #1         // row 1, pixels 1-8
    ext     v3.8b, v2.8b, v3.8b, #1         // row 5, pixels 1-8
    ext     v5.8b, v4.8b, v5.8b, #1         // row 2, pixels 1-8
    ext     v7.8b, v6.8b, v7.8b, #1         // row 6, pixels 1-8

    ld1     {v8.8b, v9.8b}, [x1], x2        // ref row 3
    ld1     {v10.8b, v11.8b}, [x14], x2     // ref row 7
    ld1     {v12.8b, v13.8b}, [x1], x2      // ref row 4
    ld1     {v14.8b, v15.8b}, [x14], x2     // ref row 8
    ext     v9.8b, v8.8b, v9.8b, #1         // row 3, pixels 1-8
    ld1     {v16.8b, v17.8b}, [x14], x2     // ref row 9 (extra row for out row 8)
    ext     v11.8b, v10.8b, v11.8b, #1      // row 7, pixels 1-8
    ext     v13.8b, v12.8b, v13.8b, #1      // row 4, pixels 1-8
    ext     v15.8b, v14.8b, v15.8b, #1      // row 8, pixels 1-8
    ext     v17.8b, v16.8b, v17.8b, #1      // row 9, pixels 1-8

    // Horizontal pass: widen to 16 bits, hx[r][c] = ref[r][c] + ref[r][c+1].
    uaddl   v0.8h, v0.8b, v1.8b             // hx row 1
    uaddl   v2.8h, v2.8b, v3.8b             // hx row 5
    uaddl   v4.8h, v4.8b, v5.8b             // hx row 2
    uaddl   v6.8h, v6.8b, v7.8b             // hx row 6
    uaddl   v8.8h, v8.8b, v9.8b             // hx row 3
    uaddl   v10.8h, v10.8b, v11.8b          // hx row 7
    uaddl   v12.8h, v12.8b, v13.8b          // hx row 4
    uaddl   v14.8h, v14.8b, v15.8b          // hx row 8
    uaddl   v16.8h, v16.8b, v17.8b          // hx row 9

    // Vertical pass: add adjacent hx rows, then RSHRN #2 = (sum + 2) >> 2, narrowed to bytes.
    add     x14, x0, x3, lsl #2             // x14 reused: out + 4 rows (out row 5)

    add     v18.8h, v0.8h, v4.8h            // hx1 + hx2
    add     v26.8h, v2.8h, v6.8h            // hx5 + hx6
    add     v20.8h, v4.8h, v8.8h            // hx2 + hx3
    add     v28.8h, v6.8h, v10.8h           // hx6 + hx7

    rshrn   v18.8b, v18.8h, #2              // out row 1
    rshrn   v26.8b, v26.8h, #2              // out row 5
    rshrn   v20.8b, v20.8h, #2              // out row 2
    rshrn   v28.8b, v28.8h, #2              // out row 6

    add     v22.8h, v8.8h, v12.8h           // hx3 + hx4
    st1     {v18.8b}, [x0], x3              // store out row 1
    add     v30.8h, v10.8h, v14.8h          // hx7 + hx8
    st1     {v26.8b}, [x14], x3             // store out row 5
    add     v24.8h, v12.8h, v2.8h           // hx4 + hx5
    st1     {v20.8b}, [x0], x3              // store out row 2
    add     v14.8h, v14.8h, v16.8h          // hx8 + hx9
    st1     {v28.8b}, [x14], x3             // store out row 6

    rshrn   v22.8b, v22.8h, #2              // out row 3
    rshrn   v30.8b, v30.8h, #2              // out row 7
    rshrn   v24.8b, v24.8h, #2              // out row 4
    rshrn   v14.8b, v14.8h, #2              // out row 8

    st1     {v22.8b}, [x0], x3              // store out row 3
    st1     {v30.8b}, [x14], x3             // store out row 7
    st1     {v24.8b}, [x0], x3              // store out row 4
    st1     {v14.8b}, [x14], x3             // store out row 8

    pop_v_regs
    ret
|
|
|
|
|
|
|
|
|
|
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_mc_fullx_fully_8x8_av8()
////
//// Detail Description : Copies an 8x8 block (8 rows of 8 bytes) from the
////                      reference block to the current block unchanged;
////                      no interpolation is performed.
////
//// Inputs             : x0 - out     : Current Block Pointer
////                      x1 - ref     : Reference Block Pointer
////                      x2 - ref_wid : Reference Block Width (row step)
////                      x3 - out_wid : Current Block Width (row step)
////
//// Registers Used     : x12, x14, v0-v3 (x0, x1 are advanced)
////
//// Stack Usage        : whatever push_v_regs reserves (macro defined in
////                      impeg2_neon_macros.s, not visible in this file)
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : NOTE(review): x2/x3 are used as full 64-bit offsets;
////                      if the C prototype declares them 32-bit, the upper
////                      halves are unspecified under AAPCS64 - confirm.
////---------------------------------------------------------------------------
//*/

    .global impeg2_mc_fullx_fully_8x8_av8

impeg2_mc_fullx_fully_8x8_av8:
    push_v_regs

    // Upper half (rows 1-4) goes through x1/x0, lower half (rows 5-8) through x14/x12.
    add     x14, x1, x2, lsl #2             // x14 = ref + 4 rows (ref row 5)
    add     x12, x0, x3, lsl #2             // x12 = out + 4 rows (out row 5)

    // Pass 1 moves rows 1, 5, 2, 6; pass 2 moves rows 3, 7, 4, 8.
    // .rept expands to the same two straight-line groups as before.
    .rept   2
    ld1     {v0.8b}, [x1], x2               // next row of the upper half
    ld1     {v1.8b}, [x14], x2              // next row of the lower half
    ld1     {v2.8b}, [x1], x2               // following row of the upper half
    ld1     {v3.8b}, [x14], x2              // following row of the lower half

    st1     {v0.8b}, [x0], x3
    st1     {v1.8b}, [x12], x3
    st1     {v2.8b}, [x0], x3
    st1     {v3.8b}, [x12], x3
    .endr

    pop_v_regs
    ret
|
|
|
|
|
|
|
|
|
|
///*
////---------------------------------------------------------------------------
//// Function Name      : impeg2_interpolate_av8()
////
//// Detail Description : Averages two macroblock buffers byte-wise with
////                      rounding, dst = (src1 + src2 + 1) >> 1 (URHADD), for
////                      16 rows x 16 bytes of luma and 8 rows x 8 bytes of
////                      each chroma plane.
////
//// Inputs             : x0 - pointer to src1 (three plane pointers: y, u, v
////                           at offsets 0, 8, 16)
////                      x1 - pointer to src2 (same layout)
////                      x2 - dest buf        (same layout)
////                      x3 - dst stride (luma; halved for chroma)
////
//// Registers Used     : x3 (halved in place), x4, x5, x7, x12, v0-v15
////
//// Stack Usage        : whatever push_v_regs reserves (macro defined in
////                      impeg2_neon_macros.s, not visible in this file)
////
//// Outputs            : The Motion Compensated Block
////
//// Return Data        : None
////
//// Programming Note   : Both sources are read as packed rows (post-increment
////                      #16: one 16-byte luma row, or two 8-byte chroma rows,
////                      per load); only the destination uses a stride.
////                      v8-v15 are written; their low halves are callee-saved
////                      under AAPCS64, presumably preserved by push_v_regs /
////                      pop_v_regs - confirm in the macro file.
////                      Chroma operands come from 64-bit loads (upper halves
////                      zero), so those averages are written with .8b.
////                      NOTE(review): x3 is used as a full 64-bit offset; if
////                      the C prototype declares it 32-bit, the upper half is
////                      unspecified under AAPCS64 - confirm.
////---------------------------------------------------------------------------
//*/

    .global impeg2_interpolate_av8

impeg2_interpolate_av8:
    push_v_regs

    // ---- luma: 4 passes x 4 rows of 16 bytes -------------------------------
    ldr     x4, [x0, #0]                    // x4 = src1->y
    ldr     x5, [x1, #0]                    // x5 = src2->y
    ldr     x7, [x2, #0]                    // x7 = dst->y
    mov     x12, #4                         // passes remaining

.Linterp_lumablocks_stride:
    ld1     {v0.16b}, [x4], #16             // src1 row n
    ld1     {v2.16b}, [x4], #16             // src1 row n+1
    ld1     {v4.16b}, [x4], #16             // src1 row n+2
    ld1     {v6.16b}, [x4], #16             // src1 row n+3

    ld1     {v8.16b}, [x5], #16             // src2 row n
    ld1     {v10.16b}, [x5], #16            // src2 row n+1
    ld1     {v12.16b}, [x5], #16            // src2 row n+2
    ld1     {v14.16b}, [x5], #16            // src2 row n+3

    urhadd  v0.16b, v0.16b, v8.16b          // rounded average, row n
    urhadd  v2.16b, v2.16b, v10.16b         // row n+1
    urhadd  v4.16b, v4.16b, v12.16b         // row n+2
    urhadd  v6.16b, v6.16b, v14.16b         // row n+3

    st1     {v0.16b}, [x7], x3              // dst row n, step by stride
    st1     {v2.16b}, [x7], x3              // dst row n+1
    st1     {v4.16b}, [x7], x3              // dst row n+2
    st1     {v6.16b}, [x7], x3              // dst row n+3

    subs    x12, x12, #1
    b.ne    .Linterp_lumablocks_stride

    // ---- chroma: pass 1 = u plane, pass 2 = v plane; 8 rows of 8 bytes each -
    lsr     x3, x3, #1                      // chroma stride = luma stride >> 1

    ldr     x4, [x0, #8]                    // x4 = src1->u
    ldr     x5, [x1, #8]                    // x5 = src2->u
    ldr     x7, [x2, #8]                    // x7 = dst->u
    mov     x12, #2                         // planes remaining

.Linterp_chromablocks_stride:
    ld1     {v0.8b, v1.8b}, [x4], #16       // src1 rows 1, 2
    ld1     {v2.8b, v3.8b}, [x4], #16       // src1 rows 3, 4
    ld1     {v4.8b, v5.8b}, [x4], #16       // src1 rows 5, 6
    ld1     {v6.8b, v7.8b}, [x4], #16       // src1 rows 7, 8

    ld1     {v8.8b, v9.8b}, [x5], #16       // src2 rows 1, 2
    ld1     {v10.8b, v11.8b}, [x5], #16     // src2 rows 3, 4
    ld1     {v12.8b, v13.8b}, [x5], #16     // src2 rows 5, 6
    ld1     {v14.8b, v15.8b}, [x5], #16     // src2 rows 7, 8

    urhadd  v0.8b, v0.8b, v8.8b             // row 1
    urhadd  v1.8b, v1.8b, v9.8b             // row 2
    urhadd  v2.8b, v2.8b, v10.8b            // row 3
    urhadd  v3.8b, v3.8b, v11.8b            // row 4
    urhadd  v4.8b, v4.8b, v12.8b            // row 5
    urhadd  v5.8b, v5.8b, v13.8b            // row 6
    urhadd  v6.8b, v6.8b, v14.8b            // row 7
    urhadd  v7.8b, v7.8b, v15.8b            // row 8

    st1     {v0.8b}, [x7], x3               // dst row 1
    st1     {v1.8b}, [x7], x3               // dst row 2
    st1     {v2.8b}, [x7], x3               // dst row 3
    st1     {v3.8b}, [x7], x3               // dst row 4
    st1     {v4.8b}, [x7], x3               // dst row 5
    st1     {v5.8b}, [x7], x3               // dst row 6
    st1     {v6.8b}, [x7], x3               // dst row 7
    st1     {v7.8b}, [x7], x3               // dst row 8

    // Point at the v plane for the second pass. After that pass the reload is
    // redundant but harmless, because the loop exits immediately afterwards.
    ldr     x4, [x0, #16]                   // x4 = src1->v
    ldr     x5, [x1, #16]                   // x5 = src2->v
    ldr     x7, [x2, #16]                   // x7 = dst->v

    subs    x12, x12, #1
    b.ne    .Linterp_chromablocks_stride

    pop_v_regs
    ret
|
|
|
|
|
|
|
|
|