///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_weighted_pred_bi_default.s
//*
//* @brief
//*  contains function definitions for weighted prediction used in inter
//*  prediction
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_weighted_pred_bi_default_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  does default bi-weighted prediction on the arrays pointed to by pi2_src1
//*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
//*  assumption: the function is optimized on the basis that width and height
//*  are multiples of 2.
//*
//* @par description:
//*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
//*        >> shift,  where shift = 15 - bitdepth
//*
//* @param[in] pi2_src1
//*  pointer to source 1
//*
//* @param[in] pi2_src2
//*  pointer to source 2
//*
//* @param[out] pu1_dst
//*  pointer to destination
//*
//* @param[in] src_strd1
//*  source stride 1
//*
//* @param[in] src_strd2
//*  source stride 2
//*
//* @param[in] dst_strd
//*  destination stride
//*
//* @param[in] lvl_shift1
//*  added before shift and offset
//*
//* @param[in] lvl_shift2
//*  added before shift and offset
//*
//* @param[in] ht
//*  height of the source
//*
//* @param[in] wd
//*  width of the source
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
//                                    word16 *pi2_src2,
//                                    uword8 *pu1_dst,
//                                    word32 src_strd1,
//                                    word32 src_strd2,
//                                    word32 dst_strd,
//                                    word32 lvl_shift1,
//                                    word32 lvl_shift2,
//                                    word32 ht,
//                                    word32 wd)
//**************variables vs registers*****************************************
// x0 => *pi2_src1
// x1 => *pi2_src2
// x2 => *pu1_dst
// x3 => src_strd1
// x4 => src_strd2
// x5 => dst_strd
// x6 => lvl_shift1
// x7 => lvl_shift2
// x8 => ht   (loaded from [sp, #0] on entry)
// x9 => wd   (loaded from [sp, #8] on entry)
// Code section and alignment for the routine that follows.
.text
.align 4

// Shared NEON helper macros for the AArch64 kernels.
.include "ihevc_neon_macros.s"

// Export the entry point and mark it as a function symbol.
.globl ihevc_weighted_pred_bi_default_av8

.type ihevc_weighted_pred_bi_default_av8, %function
//------------------------------------------------------------------------------
// ihevc_weighted_pred_bi_default_av8
//
// For every sample: saturating-add the two 16-bit sources and the constant
// (lvl_shift1 + lvl_shift2 + 0x40), then narrow to unsigned 8 bits with a
// saturating right shift by 7.
//
// In:    x0 = pi2_src1      x1 = pi2_src2      x2 = pu1_dst
//        w3 = src_strd1     w4 = src_strd2     w5 = dst_strd
//        w6 = lvl_shift1    w7 = lvl_shift2
//        [sp, #0] = ht      [sp, #8] = wd
// Out:   none (results are written through pu1_dst)
// Clobb: x0-x16, v0-v7, v16-v31, flags; x20 is used as scratch and restored
//
// Register roles after the prologue:
//        x3 / x4   source strides in bytes (2 * stride, sources are 16 bit)
//        x5        destination stride in bytes
//        x6        2 * wd (bytes per source row that are consumed)
//        x7 / x10  per-path "advance to next row group" increments
//        x8 / x9   rows / columns still to be processed
//        v0        lvl_shift1 + lvl_shift2 + 0x40 in every 16-bit lane
//------------------------------------------------------------------------------
ihevc_weighted_pred_bi_default_av8:

    // Stack arguments must be read before sp is moved by the register save.
    ldr         w8, [sp, #0]                // ht
    ldr         w9, [sp, #8]                // wd

    // x20 is callee-saved and used as scratch below; x19 only fills the pair
    // so that sp stays 16-byte aligned.
    stp         x19, x20, [sp, #-16]!

    // The strides arrive as 32-bit values; AAPCS64 leaves the upper halves of
    // x3-x5 unspecified, so widen them before any 64-bit address arithmetic.
    sxtw        x3, w3
    sxtw        x4, w4
    sxtw        x5, w5

    lsl         x3, x3, #1                  // src_strd1 in bytes
    lsl         x4, x4, #1                  // src_strd2 in bytes

    dup         v4.8h, w6                   // lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
    dup         v6.8h, w7                   // lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
    movi        v0.8h, #0x40                // tmp_lvl_shift = 1 << (shift - 1)
    add         v4.8h, v4.8h, v6.8h
    add         v0.8h, v0.8h, v4.8h         // v0 = lvl_shift1 + lvl_shift2 + rounding

    lsl         x6, x9, #1                  // 2 * wd
    sub         x20, x6, x3, lsl #2
    neg         x7, x20                     // 4 src1 rows (bytes) - 2 * wd
    sub         x20, x6, x4, lsl #2
    neg         x10, x20                    // 4 src2 rows (bytes) - 2 * wd

    cbz         x8, end_loops               // ht == 0: nothing to do

chroma_decision:
    orr         x14, x8, x9
    cmp         x14, #10                    // (ht | wd) == 10, e.g. wd = 8, ht = 2
    beq         outer_loop_chroma_8x2

    cmp         x14, #6                     // (ht | wd) == 6, e.g. wd = 4, ht = 2
    beq         outer_loop_chroma_4x2

luma_decision:
    cmp         x9, #24
    beq         outer_loop_8

    cmp         x9, #16
    bge         outer_loop_16

    cmp         x9, #12
    beq         outer_loop_4

    cmp         x9, #8
    bge         outer_loop_8

//------------------------------------------------------------------------------
// 4 columns x 4 rows per iteration
//------------------------------------------------------------------------------
outer_loop_4:
    cbz         x9, end_loops               // wd == 0: nothing to do

core_loop_4:
    add         x11, x0, x3                 // src1 row 1 of this 4x4 tile
    add         x12, x1, x4                 // src2 row 1 of this 4x4 tile
    ld1         {v6.4h}, [x0], #8           // src1 row 0, step 4 samples right
    add         x14, x2, x5                 // dst row 1
    ld1         {v7.4h}, [x1], #8           // src2 row 0, step 4 samples right
    ld1         {v1.4h}, [x11], x3          // src1 row 1
    sqadd       v18.4h, v6.4h, v7.4h
    sqadd       v18.4h, v18.4h, v0.4h       // row 0 sum + offset
    ld1         {v3.4h}, [x12], x4          // src2 row 1
    sqadd       v20.4h, v1.4h, v3.4h
    sqadd       v19.4h, v20.4h, v0.4h       // row 1 sum + offset
    mov         v18.d[1], v19.d[0]          // pack rows 0 and 1 into one q register
    sqshrun     v20.8b, v18.8h, #7
    ld1         {v22.4h}, [x11], x3         // src1 row 2
    ld1         {v23.4h}, [x12], x4         // src2 row 2
    sqadd       v30.4h, v22.4h, v23.4h
    sqadd       v30.4h, v30.4h, v0.4h       // row 2 sum + offset
    ld1         {v24.4h}, [x11], x3         // src1 row 3
    ld1         {v25.4h}, [x12], x4         // src2 row 3
    sqadd       v18.4h, v24.4h, v25.4h
    sqadd       v31.4h, v18.4h, v0.4h       // row 3 sum + offset
    mov         v30.d[1], v31.d[0]          // pack rows 2 and 3
    st1         {v20.s}[0], [x2], #4        // dst row 0, step 4 pixels right
    st1         {v20.s}[1], [x14], x5       // dst row 1
    sqshrun     v30.8b, v30.8h, #7
    st1         {v30.s}[0], [x14], x5       // dst row 2
    subs        x9, x9, #4                  // 4 columns done
    st1         {v30.s}[1], [x14], x5       // dst row 3
    bgt         core_loop_4

end_core_loop_4:
    subs        x8, x8, #4                  // 4 rows done; flags feed the bgt below
    add         x0, x0, x7                  // src1: down 4 rows, back to column 0
    asr         x9, x6, #1                  // reload wd
    add         x1, x1, x10                 // src2: down 4 rows, back to column 0
    sub         x20, x9, x5, lsl #2
    neg         x14, x20                    // 4 * dst_strd - wd
    add         x2, x2, x14                 // dst: down 4 rows, back to column 0
    bgt         core_loop_4

    b           end_loops

//------------------------------------------------------------------------------
// chroma: 4 columns x 2 rows per iteration
//------------------------------------------------------------------------------
outer_loop_chroma_4x2:
    cbz         x9, end_loops               // wd == 0: nothing to do
    sub         x20, x6, x3, lsl #1
    neg         x7, x20                     // 2 src1 rows (bytes) - 2 * wd
    sub         x20, x6, x4, lsl #1
    neg         x10, x20                    // 2 src2 rows (bytes) - 2 * wd

core_loop_chroma_4x2:
    add         x11, x0, x3                 // src1 row 1
    add         x12, x1, x4                 // src2 row 1
    ld1         {v6.4h}, [x0], #8           // src1 row 0
    add         x14, x2, x5                 // dst row 1
    ld1         {v7.4h}, [x1], #8           // src2 row 0
    ld1         {v1.4h}, [x11], x3          // src1 row 1
    sqadd       v18.4h, v6.4h, v7.4h
    sqadd       v18.4h, v18.4h, v0.4h       // row 0 sum + offset
    ld1         {v3.4h}, [x12], x4          // src2 row 1
    sqadd       v20.4h, v1.4h, v3.4h
    sqadd       v19.4h, v20.4h, v0.4h       // row 1 sum + offset
    mov         v18.d[1], v19.d[0]          // pack rows 0 and 1
    sqshrun     v20.8b, v18.8h, #7
    st1         {v20.s}[0], [x2], #4        // dst row 0
    st1         {v20.s}[1], [x14], x5       // dst row 1

    subs        x9, x9, #4                  // 4 columns done
    bgt         core_loop_chroma_4x2

end_core_loop_chroma_4x2:
    subs        x8, x8, #2                  // 2 rows done; flags feed the bgt below
    add         x0, x0, x7                  // src1: down 2 rows, back to column 0
    asr         x9, x6, #1                  // reload wd
    add         x1, x1, x10                 // src2: down 2 rows, back to column 0
    sub         x20, x9, x5, lsl #1
    neg         x14, x20                    // 2 * dst_strd - wd
    add         x2, x2, x14                 // dst: down 2 rows, back to column 0
    bgt         core_loop_chroma_4x2

    b           end_loops

//------------------------------------------------------------------------------
// 8 columns x 4 rows per iteration
//------------------------------------------------------------------------------
outer_loop_8:
    cbz         x9, end_loops               // wd == 0: nothing to do
    add         x11, x0, x3                 // src1 row 1
    add         x12, x1, x4                 // src2 row 1

core_loop_8:
    ld1         {v24.8h}, [x0], #16         // src1 row 0, step 8 samples right
    add         x14, x2, x5                 // dst row 1
    ld1         {v26.8h}, [x1], #16         // src2 row 0, step 8 samples right
    sqadd       v24.8h, v24.8h, v26.8h
    ld1         {v28.8h}, [x11], x3         // src1 row 1
    sqadd       v24.8h, v24.8h, v0.8h       // row 0 sum + offset
    ld1         {v30.8h}, [x12], x4         // src2 row 1
    ld1         {v16.8h}, [x11], x3         // src1 row 2
    sqadd       v22.8h, v28.8h, v30.8h
    ld1         {v18.8h}, [x12], x4         // src2 row 2
    sqadd       v22.8h, v22.8h, v0.8h       // row 1 sum + offset
    sqshrun     v20.8b, v24.8h, #7
    ld1         {v17.8h}, [x11], x3         // src1 row 3
    sqadd       v30.8h, v16.8h, v18.8h
    sqshrun     v21.8b, v22.8h, #7
    ld1         {v29.8h}, [x12], x4         // src2 row 3
    sqadd       v30.8h, v30.8h, v0.8h       // row 2 sum + offset
    st1         {v20.2s}, [x2], #8          // dst row 0, step 8 pixels right
    sqadd       v1.8h, v17.8h, v29.8h
    st1         {v21.2s}, [x14], x5         // dst row 1
    sqadd       v1.8h, v1.8h, v0.8h         // row 3 sum + offset
    sqshrun     v30.8b, v30.8h, #7
    sqshrun     v31.8b, v1.8h, #7
    add         x11, x0, x3                 // src1 row 1 of the next tile
    add         x12, x1, x4                 // src2 row 1 of the next tile
    st1         {v30.2s}, [x14], x5         // dst row 2
    subs        x9, x9, #8                  // 8 columns done
    st1         {v31.2s}, [x14], x5         // dst row 3
    bgt         core_loop_8

end_core_loop_8:
    subs        x8, x8, #4                  // 4 rows done; flags feed the bgt below
    add         x0, x0, x7                  // src1: down 4 rows, back to column 0
    asr         x9, x6, #1                  // reload wd
    add         x1, x1, x10                 // src2: down 4 rows, back to column 0
    sub         x20, x9, x5, lsl #2
    neg         x14, x20                    // 4 * dst_strd - wd
    add         x2, x2, x14                 // dst: down 4 rows, back to column 0
    add         x11, x0, x3                 // src1 row 1 of the next tile
    add         x12, x1, x4                 // src2 row 1 of the next tile
    bgt         core_loop_8

    b           end_loops

//------------------------------------------------------------------------------
// chroma: 8 columns x 2 rows per iteration
//------------------------------------------------------------------------------
outer_loop_chroma_8x2:
    cbz         x9, end_loops               // wd == 0: nothing to do
    add         x11, x0, x3                 // src1 row 1
    add         x12, x1, x4                 // src2 row 1
    sub         x20, x6, x3, lsl #1
    neg         x7, x20                     // 2 src1 rows (bytes) - 2 * wd
    sub         x20, x6, x4, lsl #1
    neg         x10, x20                    // 2 src2 rows (bytes) - 2 * wd

core_loop_chroma_8x2:
    // Only two rows are read per tile; nothing below row 1 is touched.
    ld1         {v24.8h}, [x0], #16         // src1 row 0, step 8 samples right
    add         x14, x2, x5                 // dst row 1
    ld1         {v26.8h}, [x1], #16         // src2 row 0, step 8 samples right
    sqadd       v24.8h, v24.8h, v26.8h
    ld1         {v28.8h}, [x11], x3         // src1 row 1
    sqadd       v24.8h, v24.8h, v0.8h       // row 0 sum + offset
    ld1         {v30.8h}, [x12], x4         // src2 row 1
    sqadd       v22.8h, v28.8h, v30.8h
    sqadd       v22.8h, v22.8h, v0.8h       // row 1 sum + offset
    sqshrun     v20.8b, v24.8h, #7
    sqshrun     v21.8b, v22.8h, #7
    st1         {v20.2s}, [x2], #8          // dst row 0, step 8 pixels right
    st1         {v21.2s}, [x14], x5         // dst row 1

    add         x11, x0, x3                 // src1 row 1 of the next tile
    add         x12, x1, x4                 // src2 row 1 of the next tile

    subs        x9, x9, #8                  // 8 columns done
    bgt         core_loop_chroma_8x2

end_core_loop_chroma_8x2:
    subs        x8, x8, #2                  // 2 rows done; flags feed the bgt below
    add         x0, x0, x7                  // src1: down 2 rows, back to column 0
    asr         x9, x6, #1                  // reload wd
    add         x1, x1, x10                 // src2: down 2 rows, back to column 0
    sub         x20, x9, x5, lsl #1
    neg         x14, x20                    // 2 * dst_strd - wd
    add         x2, x2, x14                 // dst: down 2 rows, back to column 0
    add         x11, x0, x3                 // src1 row 1 of the next tile
    add         x12, x1, x4                 // src2 row 1 of the next tile
    bgt         core_loop_chroma_8x2

    b           end_loops

//------------------------------------------------------------------------------
// 16 columns x 2 rows per iteration, software pipelined:
// the loads for tile N+1 are interleaved with the arithmetic/stores of tile N.
//
//   x7  / x16  src1 / src2: 2 rows (bytes) - 2 * wd   (next row pair, column 0)
//   x11 / x13  src1 / src2: row bytes - 16            (row 0 col 8 -> row 1 col 0)
//   x12 / x15  src1 / src2: 16 - row bytes            (row 1 col 8 -> row 0 col 16)
//   x10        16 - dst_strd                          (dst row 1 -> row 0, next tile)
//   x14        2 * dst_strd - wd                      (dst next row pair, column 0)
//------------------------------------------------------------------------------
outer_loop_16:
    cbz         x9, end_loops               // wd == 0: nothing to do
    sub         x20, x6, x3, lsl #1
    neg         x7, x20
    sub         x20, x6, x4, lsl #1
    neg         x16, x20
    mov         x14, #16
    sub         x10, x14, x5
    sub         x11, x3, x14
    sub         x12, x14, x3
    sub         x13, x4, x14
    sub         x15, x14, x4

    sub         x20, x9, x5, lsl #1
    neg         x14, x20

prolog_16:
    ld1         {v2.8h}, [x0], #16          // src1 row 0, cols 0-7
    ld1         {v4.8h}, [x1], #16          // src2 row 0, cols 0-7
    ld1         {v5.8h}, [x0], x11          // src1 row 0, cols 8-15
    ld1         {v17.8h}, [x1], x13         // src2 row 0, cols 8-15
    ld1         {v6.8h}, [x0], #16          // src1 row 1, cols 0-7
    subs        x9, x9, #16                 // eq: this tile ends its row pair
    ld1         {v1.8h}, [x1], #16          // src2 row 1, cols 0-7
    sub         x20, x8, #2
    csel        x8, x20, x8, eq             // row pair finished: ht -= 2
    sqadd       v22.8h, v2.8h, v4.8h
    ld1         {v29.8h}, [x0], x12         // src1 row 1, cols 8-15
    sqadd       v28.8h, v5.8h, v17.8h
    ld1         {v16.8h}, [x1], x15         // src2 row 1, cols 8-15
    add         x20, x0, x7
    csel        x0, x20, x0, eq             // row pair finished: src1 to next pair
    add         x20, x1, x16
    csel        x1, x20, x1, eq             // row pair finished: src2 to next pair
    sqadd       v24.8h, v6.8h, v1.8h
    sqadd       v26.8h, v29.8h, v16.8h

    // A single 16x2 tile has nothing further to load; leave before touching
    // any memory beyond it.
    cbz         x8, epilog_16

    ld1         {v2.8h}, [x0], #16          // next tile: src1 row 0, cols 0-7
    ld1         {v4.8h}, [x1], #16          // next tile: src2 row 0, cols 0-7
    sqadd       v22.8h, v22.8h, v0.8h
    ld1         {v5.8h}, [x0], x11          // next tile: src1 row 0, cols 8-15
    sqadd       v28.8h, v28.8h, v0.8h
    ld1         {v17.8h}, [x1], x13         // next tile: src2 row 0, cols 8-15
    sqadd       v24.8h, v24.8h, v0.8h
    ld1         {v6.8h}, [x0], #16          // next tile: src1 row 1, cols 0-7
    sqadd       v30.8h, v26.8h, v0.8h
    sqshrun     v20.8b, v22.8h, #7
    ld1         {v1.8h}, [x1], #16          // next tile: src2 row 1, cols 0-7
    sqshrun     v21.8b, v28.8h, #7
    ld1         {v29.8h}, [x0], x12         // next tile: src1 row 1, cols 8-15
    sqshrun     v26.8b, v24.8h, #7
    ld1         {v16.8h}, [x1], x15         // next tile: src2 row 1, cols 8-15
    sqshrun     v27.8b, v30.8h, #7

core_loop_16:
    cmp         x9, #0                      // eq: the tile being stored ended its row pair
    sqadd       v22.8h, v2.8h, v4.8h
    asr         x20, x6, #1
    csel        x9, x20, x9, eq             // row pair finished: reload wd
    mov         v20.d[1], v21.d[0]          // pack dst row 0 (16 pixels)
    mov         v26.d[1], v27.d[0]          // pack dst row 1 (16 pixels)
    st1         {v20.4s}, [x2], x5          // dst row 0, then down one row
    sqadd       v28.8h, v5.8h, v17.8h
    st1         {v26.4s}, [x2], x10         // dst row 1, then up one row and 16 right
    add         x20, x2, x14
    csel        x2, x20, x2, eq             // row pair finished: dst to next pair
    sqadd       v24.8h, v6.8h, v1.8h
    subs        x9, x9, #16                 // eq: the tile just loaded ends its row pair
    add         x20, x0, x7
    csel        x0, x20, x0, eq             // src1 to next row pair
    sqadd       v26.8h, v29.8h, v16.8h

    add         x20, x1, x16
    csel        x1, x20, x1, eq             // src2 to next row pair
    sub         x20, x8, #2
    csel        x8, x20, x8, eq             // ht -= 2
    cbz         x8, epilog_16               // no rows left to load: drain the pipeline

    sqadd       v22.8h, v22.8h, v0.8h
    ld1         {v2.8h}, [x0], #16          // next tile: src1 row 0, cols 0-7
    sqadd       v28.8h, v28.8h, v0.8h
    ld1         {v4.8h}, [x1], #16          // next tile: src2 row 0, cols 0-7
    sqadd       v24.8h, v24.8h, v0.8h
    ld1         {v5.8h}, [x0], x11          // next tile: src1 row 0, cols 8-15
    sqadd       v30.8h, v26.8h, v0.8h
    ld1         {v17.8h}, [x1], x13         // next tile: src2 row 0, cols 8-15
    sqshrun     v20.8b, v22.8h, #7
    ld1         {v6.8h}, [x0], #16          // next tile: src1 row 1, cols 0-7
    sqshrun     v21.8b, v28.8h, #7
    ld1         {v1.8h}, [x1], #16          // next tile: src2 row 1, cols 0-7
    sqshrun     v26.8b, v24.8h, #7
    ld1         {v29.8h}, [x0], x12         // next tile: src1 row 1, cols 8-15
    sqshrun     v27.8b, v30.8h, #7
    ld1         {v16.8h}, [x1], x15         // next tile: src2 row 1, cols 8-15

    b           core_loop_16

epilog_16:
    // Finish and store the last tile whose raw sums are already in registers.
    sqadd       v22.8h, v22.8h, v0.8h
    sqadd       v28.8h, v28.8h, v0.8h
    sqadd       v24.8h, v24.8h, v0.8h
    sqadd       v30.8h, v26.8h, v0.8h
    sqshrun     v20.8b, v22.8h, #7
    sqshrun     v21.8b, v28.8h, #7
    sqshrun     v26.8b, v24.8h, #7
    sqshrun     v27.8b, v30.8h, #7
    mov         v20.d[1], v21.d[0]
    mov         v26.d[1], v27.d[0]
    st1         {v20.4s}, [x2], x5          // dst row 0
    st1         {v26.4s}, [x2]              // dst row 1

end_core_loop_16:

end_loops:
    ldp         x19, x20, [sp], #16
    ret