406 lines
15 KiB
ArmAsm
406 lines
15 KiB
ArmAsm
///*****************************************************************************
|
|
//*
|
|
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************/
|
|
///**
|
|
//*******************************************************************************
|
|
//* //file
|
|
//* ihevc_inter_pred_chroma_vert_neon.s
|
|
//*
|
|
//* //brief
|
|
//* contains function definitions for inter prediction interpolation.
|
|
//* functions are coded using neon intrinsics and can be compiled using
|
|
|
|
//* rvct
|
|
//*
|
|
//* //author
|
|
//* yogeswaran rs
|
|
//*
|
|
//* //par list of functions:
|
|
//*
|
|
//*
|
|
//* //remarks
|
|
//* none
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
///**
|
|
///**
|
|
//*******************************************************************************
|
|
//*
|
|
//* //brief
|
|
//* chroma interprediction filter for vertical input
|
|
//*
|
|
//* //par description:
|
|
//* applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
|
|
//* the elements pointed by 'pu1_src' and writes to the location pointed by
|
|
//* 'pu1_dst' the output is down shifted by 6 and clipped to 8 bits
|
|
//* assumptions : the function is optimized considering the fact width is
|
|
//* multiple of 2,4 or 8. and also considering height should be multiple of 2
|
|
//* width 4,8 is optimized further
|
|
//*
|
|
//* //param[in] pu1_src
|
|
//* uword8 pointer to the source
|
|
//*
|
|
//* //param[out] pu1_dst
|
|
//* uword8 pointer to the destination
|
|
//*
|
|
//* //param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* //param[in] dst_strd
|
|
//* integer destination stride
|
|
//*
|
|
//* //param[in] pi1_coeff
|
|
//* word8 pointer to the filter coefficients
|
|
//*
|
|
//* //param[in] ht
|
|
//* integer height of the array
|
|
//*
|
|
//* //param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* //returns
|
|
//*
|
|
//* //remarks
|
|
//* none
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
//void ihevc_inter_pred_chroma_vert(uword8 *pu1_src,
|
|
// uword8 *pu1_dst,
|
|
// word32 src_strd,
|
|
// word32 dst_strd,
|
|
// word8 *pi1_coeff,
|
|
// word32 ht,
|
|
// word32 wd)
|
|
//**************variables vs registers*****************************************
|
|
//x0 => *pu1_src
|
|
//x1 => *pu1_dst
|
|
//x2 => src_strd
|
|
//x3 => dst_strd
|
|
.text
|
|
.align 4
|
|
|
|
.include "ihevc_neon_macros.s"
|
|
|
|
.globl ihevc_inter_pred_chroma_vert_av8
|
|
|
|
.type ihevc_inter_pred_chroma_vert_av8, %function
|
|
|
|
ihevc_inter_pred_chroma_vert_av8:

    //-------------------------------------------------------------------------
    // 4-tap vertical chroma interpolation, 8-bit in / 8-bit out.
    //
    // arguments on entry (see the C prototype above):
    //   x0 = pu1_src    x1 = pu1_dst    w2 = src_strd   w3 = dst_strd
    //   x4 = pi1_coeff  w5 = ht         w6 = wd
    //
    // every row is processed as 2*wd bytes (x10 = 2*wd)
    // NOTE(review): presumably because chroma samples are stored interleaved
    //               (two bytes per chroma position) - confirm with the caller.
    //
    // per output byte, using the absolute values of the four taps:
    //   res = c1*row[1] - c0*row[0] + c2*row[2] - c3*row[3]
    //   dst = saturate_u8((res + 32) >> 6)                    (sqrshrun #6)
    // where row[0] is the row at (pu1_src - src_strd).
    //
    // three code paths:
    //   (wd & 3) != 0               -> outer_loop_wd_2 : 4 bytes x 2 rows per pass
    //   (wd & 3) == 0, (ht & 7) != 0 -> inner_loop_ht_2 : 8 bytes x 2 rows per pass
    //   (wd & 3) == 0, (ht & 7) == 0 -> core_loop_ht_8  : 8 bytes x 4 rows per pass,
    //                                   software pipelined (prolog/kernel_8/epilog)
    //
    // registers written: x0,x1,x4-x12,x14,x20, v0-v7, v16-v18, v24-v30, flags
    //-------------------------------------------------------------------------

    // stmfd sp!,{x4-x12,x14} //stack stores the values of the arguments

    stp         x19, x20,[sp,#-16]!         //x20 is used as scratch below; x19 is saved with it so sp stays 16-byte aligned

    //src_strd, dst_strd, ht and wd are 32-bit arguments; the upper halves of
    //their registers are not guaranteed by the procedure call standard, so
    //widen them before they are used in 64-bit address and counter arithmetic
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    mov         x12,x4                      //pi1_coeff
    sxtw        x4,w5                       //ht
    sxtw        x6,w6                       //wd

    cmp         x4,#0                       //checks ht <= 0
    sub         x0,x0,x2                    //pu1_src - src_strd
    ld1         {v0.s}[0],[x12]             //loads the 4 filter taps (only b[0]..b[3] are consumed below)

    ble         end_loops                   //nothing to do when ht <= 0

    tst         x6,#3                       //checks (wd & 3)
    abs         v3.8b, v0.8b                //vabs_s8(coeff)
    lsl         x10,x6,#1                   //2*wd
    dup         v0.8b, v3.b[0]              //coeffabs_0
    dup         v1.8b, v3.b[1]              //coeffabs_1
    dup         v2.8b, v3.b[2]              //coeffabs_2
    dup         v3.8b, v3.b[3]              //coeffabs_3 (v3 is overwritten last since it is the dup source)

    bgt         outer_loop_wd_2             //(wd & 3) != 0 : flags still from the tst above

    tst         x4,#7                       //checks ht for mul of 8
    beq         core_loop_ht_8              //when height is multiple of 8

    lsl         x7,x3,#1                    //2*dst_strd
    sub         x9,x7,x10                   //2*dst_strd - 2wd
    lsl         x12,x2,#1                   //2*src_strd
    sub         x8,x12,x10                  //2*src_strd - 2wd
    mov         x5,x10                      //2wd

inner_loop_ht_2:                            //called when wd is multiple of 4 and ht is not a multiple of 8

    add         x6,x0,x2                    //pu1_src +src_strd
    ld1         {v17.8b},[x6],x2            //row 1
    subs        x5,x5,#8                    //2wd - 8 (flags consumed by the first bgt below)
    ld1         {v5.8b},[x0],#8             //row 0, advances pu1_src by 8
    umull       v6.8h, v17.8b, v1.8b        //out0  = row1 * coeffabs_1
    ld1         {v4.8b},[x6],x2             //row 2
    umlsl       v6.8h, v5.8b, v0.8b         //out0 -= row0 * coeffabs_0
    ld1         {v16.8b},[x6],x2            //row 3
    umlal       v6.8h, v4.8b, v2.8b         //out0 += row2 * coeffabs_2
    umull       v4.8h, v4.8b, v1.8b         //out1  = row2 * coeffabs_1
    umlsl       v6.8h, v16.8b, v3.8b        //out0 -= row3 * coeffabs_3
    umlsl       v4.8h, v17.8b, v0.8b        //out1 -= row1 * coeffabs_0
    ld1         {v18.8b},[x6]               //row 4
    umlal       v4.8h, v16.8b, v2.8b        //out1 += row3 * coeffabs_2
    sqrshrun    v6.8b, v6.8h,#6             //round, shift right by 6, saturate to 8 bits
    umlsl       v4.8h, v18.8b, v3.8b        //out1 -= row4 * coeffabs_3
    add         x6,x1,x3                    //pu1_dst + dst_strd
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to 8 bits
    st1         {v6.8b},[x1],#8             //stores output row 0, advances pu1_dst by 8

    st1         {v4.8b},[x6]                //stores output row 1

    bgt         inner_loop_ht_2             //more bytes left in this pair of rows

    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += (2*dst_strd - 2wd)
    mov         x5,x10                      //2wd
    add         x0,x0,x8                    //pu1_src += (2*src_strd - 2wd)

    bgt         inner_loop_ht_2             //loop again

    b           end_loops                   //jumps to end

outer_loop_wd_2:                            //called when (wd & 3) != 0
    lsl         x5,x3,#1                    //2*dst_strd
    mov         x12,x10                     //2wd
    sub         x9,x5,x10                   //2*dst_strd - 2wd
    lsl         x7,x2,#1                    //2*src_strd
    sub         x8,x7,x10                   //2*src_strd - 2wd

inner_loop_wd_2:

    //each 64-bit vector holds two consecutive rows of 4 bytes, so one
    //multiply-accumulate chain yields two output rows at once
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v6.s}[0],[x0]              //v6 = {row0, -}
    subs        x12,x12,#4                  //2wd - 4 (flags consumed by the first bgt below)
    add         x0,x0,#4                    //pu1_src + 4
    ld1         {v6.s}[1],[x6],x2           //v6 = {row0, row1}
    dup         v7.2s, v6.s[1]
    ld1         {v7.s}[1],[x6],x2           //v7 = {row1, row2}
    umull       v4.8h, v7.8b, v1.8b         //acc  = {row1,row2} * coeffabs_1
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1],[x6],x2           //v7 = {row2, row3}
    umlsl       v4.8h, v6.8b, v0.8b         //acc -= {row0,row1} * coeffabs_0
    umlal       v4.8h, v7.8b, v2.8b         //acc += {row2,row3} * coeffabs_2
    dup         v7.2s, v7.s[1]
    ld1         {v7.s}[1],[x6]              //v7 = {row3, row4}
    add         x6,x1,x3                    //pu1_dst + dst_strd
    umlsl       v4.8h, v7.8b, v3.8b         //acc -= {row3,row4} * coeffabs_3
    sqrshrun    v4.8b, v4.8h,#6             //round, shift right by 6, saturate to 8 bits
    st1         {v4.s}[0],[x1]              //stores output row 0
    add         x1,x1,#4                    //pu1_dst += 4
    st1         {v4.s}[1],[x6]              //stores output row 1

    bgt         inner_loop_wd_2             //more bytes left in this pair of rows

    //inner loop ends
    subs        x4,x4,#2                    //ht - 2
    add         x1,x1,x9                    //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x10                     //2wd
    add         x0,x0,x8                    //pu1_src += 2*src_strd - 2*wd

    bgt         inner_loop_wd_2             //loop again

    b           end_loops                   //jumps to end

core_loop_ht_8:                             //when wd is a multiple of 4 and ht a multiple of 8

    lsl         x12,x3,#2                   //4*dst_strd
    sub         x8,x12,x10                  //4*dst_strd - 2wd
    lsl         x12,x2,#2                   //4*src_strd
    sub         x9,x12,x10                  //4*src_strd - 2wd

    bic         x5,x10,#7                   //x5 = bytes left in the current band of 4 rows
    lsr         x14, x10, #3                //number of 8-byte columns per row
    mul         x12, x4 , x14               //ht * columns
    sub         x12, x12,#4                 //one 8x4 pass (4 units) is left for the epilog

prolog:
    //first 8x4 pass: rows are loaded into v4,v5,v6,v7,v16,v17,v18 and the
    //four results accumulate in v30,v28,v26,v24. The last two results are
    //stored by whichever stage runs next (kernel_8 or epilog).
    add         x6,x0,x2                    //pu1_src + src_strd
    ld1         {v5.8b},[x6],x2             //loads pu1_src
    subs        x5,x5,#8                    //2wd - 8 (flags drive the three csel ..,le below)
    ld1         {v4.8b},[x0],#8             //loads the source
    ld1         {v6.8b},[x6],x2             //load and increment
    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
    ld1         {v7.8b},[x6],x2             //load and increment
    umlsl       v30.8h, v4.8b, v0.8b
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlal       v30.8h, v6.8b, v2.8b
    umlsl       v30.8h, v7.8b, v3.8b
    ld1         {v16.8b},[x6],x2            //load and increment

    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
    add         x20,x0,x9                   //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le              //end of band: move pu1_src down 4 rows, back to column 0
    umlsl       v28.8h, v5.8b, v0.8b
    bic         x20,x10,#7                  //2wd
    csel        x5, x20, x5,le              //end of band: reload the per-band byte counter
    umlal       v28.8h, v7.8b, v2.8b
    ld1         {v17.8b},[x6],x2
    umlsl       v28.8h, v16.8b, v3.8b
    sqrshrun    v30.8b, v30.8h,#6

    ld1         {v18.8b},[x6],x2
    umull       v26.8h, v7.8b, v1.8b
    add         x6,x0,x2                    //pu1_src + src_strd
    umlsl       v26.8h, v6.8b, v0.8b
    st1         {v30.8b},[x1],#8            //stores the loaded value
    umlal       v26.8h, v16.8b, v2.8b
    ld1         {v4.8b},[x0],#8             //loads the source
    umlsl       v26.8h, v17.8b, v3.8b
    sqrshrun    v28.8b, v28.8h,#6

    add         x20,x1,x8                   //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le              //end of band: move pu1_dst down 4 rows, back to column 0
    umull       v24.8h, v16.8b, v1.8b
    ld1         {v5.8b},[x6],x2             //loads pu1_src
    umlsl       v24.8h, v7.8b, v0.8b
    subs        x12,x12,#4                  //flags consumed by "ble epilog" below
    ld1         {v6.8b},[x6],x2             //load and increment
    umlal       v24.8h, v17.8b, v2.8b
    ld1         {v7.8b},[x6],x2             //load and increment
    umlsl       v24.8h, v18.8b, v3.8b

    lsl         x11,x2,#2                   //(value is replaced by 7*src_strd two instructions below)
    st1         {v28.8b},[x7],x3            //stores the loaded value
    sqrshrun    v26.8b, v26.8h,#6
    sub         x20,x2,x2,lsl #3            //-7*src_strd
    neg         x11, x20                    //x11 = 7*src_strd  : prefetch offset from pu1_src
    add         x14,x2,x2,lsl #1            //3*src_strd
    add         x14,x14,x11                 //x14 = 10*src_strd : upper bound for the prefetch offset
    ble         epilog                      //jumps to epilog

kernel_8:

    //steady state: finishes and stores the last two rows of the previous
    //pass while computing the current one and loading rows for the next
    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
    subs        x5,x5,#8                    //2wd - 8 (flags drive the four csel ..,le below)
    umlsl       v30.8h, v4.8b, v0.8b
    add         x20,x0,x9                   //pu1_src + (4*src_strd - 2*wd)
    csel        x0, x20, x0,le              //end of band: move pu1_src down 4 rows, back to column 0
    umlal       v30.8h, v6.8b, v2.8b
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,le              //end of band: reset the prefetch offset
    //rsble x11,x2,x2,lsl #3
    umlsl       v30.8h, v7.8b, v3.8b
    st1         {v26.8b},[x7],x3            //stores the loaded value
    sqrshrun    v24.8b, v24.8h,#6

    ld1         {v16.8b},[x6],x2            //load and increment

    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
    bic         x20,x10,#7                  //2wd
    csel        x5, x20, x5,le              //end of band: reload the per-band byte counter
    umlsl       v28.8h, v5.8b, v0.8b
    st1         {v24.8b},[x7],x3            //stores the loaded value

    umlal       v28.8h, v7.8b, v2.8b

    ld1         {v17.8b},[x6],x2
    sqrshrun    v30.8b, v30.8h,#6

    umlsl       v28.8h, v16.8b, v3.8b
    ld1         {v18.8b},[x6],x2
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umull       v26.8h, v7.8b, v1.8b
    add         x6,x0,x2                    //pu1_src + src_strd

    add         x20,x0, x11
    prfm        PLDL1KEEP,[x20]             //prefetch source at pu1_src + x11


    umlsl       v26.8h, v6.8b, v0.8b
    ld1         {v4.8b},[x0],#8             //loads the source

    umlal       v26.8h, v16.8b, v2.8b
    st1         {v30.8b},[x1],#8            //stores the loaded value

    umlsl       v26.8h, v17.8b, v3.8b
    ld1         {v5.8b},[x6],x2             //loads pu1_src

    add         x11,x11,x2                  //prefetch one row further next time
    sqrshrun    v28.8b, v28.8h,#6

    umull       v24.8h, v16.8b, v1.8b
    ld1         {v6.8b},[x6],x2             //load and increment
    add         x20,x1,x8                   //pu1_dst + (4*dst_strd - 2*wd)
    csel        x1, x20, x1,le              //end of band: move pu1_dst down 4 rows (flags still from subs x5)

    cmp         x11,x14
    lsl         x20,x2,#3
    sub         x20,x20,x2                  //7*src_strd
    csel        x11,x20,x11,gt              //wrap the prefetch offset once it exceeds 10*src_strd
    //rsbgt x11,x2,x2,lsl #3

    umlsl       v24.8h, v7.8b, v0.8b
    subs        x12,x12,#4                  //flags consumed by "bgt kernel_8" below

    umlal       v24.8h, v17.8b, v2.8b
    ld1         {v7.8b},[x6],x2             //load and increment

    umlsl       v24.8h, v18.8b, v3.8b
    st1         {v28.8b},[x7],x3            //stores the loaded value
    sqrshrun    v26.8b, v26.8h,#6

    bgt         kernel_8                    //jumps to kernel_8

epilog:

    //drains the pipeline: stores the two pending rows of the previous pass
    //and computes/stores the final 8x4 pass (its first rows are already in
    //v4-v7, loaded by the preceding stage)
    umull       v30.8h, v5.8b, v1.8b        //mul with coeff 1
    umlsl       v30.8h, v4.8b, v0.8b
    umlal       v30.8h, v6.8b, v2.8b
    umlsl       v30.8h, v7.8b, v3.8b
    st1         {v26.8b},[x7],x3            //stores the loaded value
    sqrshrun    v24.8b, v24.8h,#6

    ld1         {v16.8b},[x6],x2            //load and increment
    umull       v28.8h, v6.8b, v1.8b        //mul_res 2
    umlsl       v28.8h, v5.8b, v0.8b
    umlal       v28.8h, v7.8b, v2.8b
    umlsl       v28.8h, v16.8b, v3.8b
    st1         {v24.8b},[x7],x3            //stores the loaded value
    sqrshrun    v30.8b, v30.8h,#6

    ld1         {v17.8b},[x6],x2
    umull       v26.8h, v7.8b, v1.8b
    add         x7,x1,x3                    //pu1_dst + dst_strd
    umlsl       v26.8h, v6.8b, v0.8b
    st1         {v30.8b},[x1],#8            //stores the loaded value

    sqrshrun    v28.8b, v28.8h,#6
    umlal       v26.8h, v16.8b, v2.8b
    ld1         {v18.8b},[x6],x2
    umlsl       v26.8h, v17.8b, v3.8b

    umull       v24.8h, v16.8b, v1.8b
    sqrshrun    v26.8b, v26.8h,#6
    st1         {v28.8b},[x7],x3            //stores the loaded value
    umlsl       v24.8h, v7.8b, v0.8b
    umlal       v24.8h, v17.8b, v2.8b
    st1         {v26.8b},[x7],x3            //stores the loaded value
    umlsl       v24.8h, v18.8b, v3.8b

    sqrshrun    v24.8b, v24.8h,#6
    st1         {v24.8b},[x7],x3            //stores the loaded value
end_loops:
    // ldmfd sp!,{x4-x12,x15} //reload the registers from sp
    ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed on entry

    ret
|
|
|
|
|
|
|