android13/external/libhevc/common/arm/ihevc_weighted_pred_uni.s

@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_uni.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@* prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_uni()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@/**
@*******************************************************************************
@*
@* @brief
@*  does uni-weighted prediction on the array pointed by  pi2_src and stores
@* it at the location pointed by pu1_dst assumptions : the function is
@* optimized considering the fact width and  height are multiple of 4.
@*
@* @par description:
@*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
@* offset
@*
@* @param[in] pi2_src
@*  pointer to the source
@*
@* @param[out] pu1_dst
@*  pointer to the destination
@*
@* @param[in] src_strd
@*  source stride
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] wgt0
@*  weight to be multiplied to the source
@*
@* @param[in] off0
@*  offset to be added after rounding and shifting
@*
@* @param[in] shift
@*  (14 bit depth) + log2_weight_denominator
@*
@* @param[in] lvl_shift
@*  added before shift and offset
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_weighted_pred_uni(word16 *pi2_src,
@                             uword8 *pu1_dst,
@                             word32 src_strd,
@                             word32 dst_strd,
@                             word32 wgt0,
@                             word32 off0,
@                             word32 shift,
@                             word32 lvl_shift,
@                             word32 ht,
@                             word32 wd)

@**************variables vs registers*****************************************
@   r0 => *pi2_src
@   r1 => *pu1_dst
@   r2 =>  src_strd
@   r3 =>  dst_strd
@   r4 =>  wgt0
@   r5 =>  off0
@   r6 =>  shift
@   r7 =>  lvl_shift
@   r8 =>   ht
@   r9  =>  wd

@ Byte offsets from sp, as it stands after the function prologue, to the
@ arguments that are passed on the stack (wgt0 onwards).
@ The prologue pushes r4-r12 and r14 (10 * 4 = 40 bytes) followed by d8-d15
@ (8 * 8 = 64 bytes), so the first stack argument is found at sp + 104 and
@ each following 32-bit argument is 4 bytes higher.
@ These values must be kept in sync with the register lists in the prologue.
.equ    wgt0_offset,        104
.equ    off0_offset,        108
.equ    shift_offset,       112
.equ    lvl_shift_offset,   116
.equ    ht_offset,          120
.equ    wd_offset,          124

.text
.align 4


.globl ihevc_weighted_pred_uni_a9q

.type ihevc_weighted_pred_uni_a9q, %function

@-------------------------------------------------------------------------------
@ ihevc_weighted_pred_uni_a9q
@
@ For every source sample this routine evaluates
@     tmp = (src * wgt0 + tmp_lvl_shift) >> shift
@ with
@     tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1))
@ and stores tmp saturated to an unsigned byte.
@
@ In registers : r0 = pi2_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd
@ On the stack : wgt0, off0, shift, lvl_shift, ht, wd (read via the *_offset
@                constants)
@
@ One pass of core_loop handles a tile of 4 columns by 4 rows: wd is stepped
@ by 4 inside core_loop and ht is stepped by 4 in end_core_loop.
@ NEON registers used: d0-d2, q2, q3, d8, d9, q5, q6, q14, q15.
@-------------------------------------------------------------------------------
ihevc_weighted_pred_uni_a9q:

    stmfd       sp!, {r4-r12, r14}          @save r4-r12 and lr (10 words = 40 bytes)
    @save d8-d15 (64 bytes); stack arguments now start at sp + 104
    vpush       {d8  -  d15}

    @ Setup. Once this sequence has run:
    @   d0[0] = wgt0 (16-bit scalar operand for vmull)
    @   q15   = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1)), all lanes
    @   q14   = -shift, all lanes (vshl with a negative count shifts right)
    @   r2    = src_strd * 2 (source row pitch in bytes, source samples are 16 bit)
    @   r4    = wd * 2       (r4 is reused once wgt0 has been copied to d0[0])
    @   r8    = ht, r9 = wd
    ldr         r4,[sp,#wgt0_offset]        @load wgt0
    ldr         r7,[sp,#lvl_shift_offset]   @load lvl_shift
    @r11 = 1, shifted below to build the rounding term
    mov         r11,#1
    ldr         r5,[sp,#off0_offset]        @load off0
    mul         r10,r7,r4                   @lvl_shift * wgt0
    ldr         r6,[sp,#shift_offset]       @load shift
    ldr         r8,[sp,#ht_offset]          @load ht
    add         r10,r10,r5,lsl r6           @lvl_shift * wgt0 + (off0 << shift)
    ldr         r9,[sp,#wd_offset]          @load wd
    @r12 = shift - 1
    sub         r12,r6,#1
    vmov.s16    d0[0],r4                    @moved for scalar multiplication
    @r2 = src_strd * 2, i.e. the source stride in bytes
    lsl         r2,r2,#1
    vdup.u32    q14,r6                      @vmovq_n_s32(tmp_shift)
    add         r10,r10,r11,lsl r12         @tmp_lvl_shift += (1 << (shift - 1))
    vdup.s32    q15,r10                     @vmovq_n_s32(tmp_lvl_shift)
    @q14 = -shift so that the vshl.s32 below performs a right shift by shift
    vneg.s32    q14,q14
    @r4 = wd * 2, used in end_core_loop to rewind pi2_src and to restore wd
    lsl         r4,r9,#1

    cmp         r8,#0                       @check ht == 0
    beq         end_loops                   @if equal, then end the function

    @ Note: both loop-back branches below target core_loop, so this wd check is
    @ executed only once, on entry.
outer_loop:
    cmp         r9,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

    @ One pass: load 4 samples from each of 4 consecutive source rows, apply
    @ weight / rounding / shift, saturate to bytes and store 4 bytes to each of
    @ 4 consecutive destination rows. The four rows (i..iv) are interleaved.
    @ r5 / r6 are the walking source / destination pointers for rows ii..iv.
core_loop:
    add         r5,r0,r2                    @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src is a 16 bit pointer)
    add         r6,r1,r3                    @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {d1},[r0]!                  @load 4 samples of row i and advance pi2_src by 8 bytes
    vld1.s16    {d2},[r5],r2                @load and increment the pi2_src_tmp ii iteration
    vmull.s16   q2,d1,d0[0]                 @vmull_n_s16(pi2_src_val1, (int16_t) wgt0)

    vadd.i32    q2,q2,q15                   @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
    vld1.s16    {d8},[r5],r2                @load and increment the pi2_src_tmp iii iteration

    vmull.s16   q3,d2,d0[0]                 @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
    vld1.s16    {d9},[r5],r2                @load and increment the pi2_src_tmp iv iteration

    vshl.s32    q2,q2,q14                   @vshlq_s32(i4_tmp1_t, tmp_shift_t): right shift, q14 holds -shift
    vadd.i32    q3,q3,q15                   @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration

    vmull.s16   q5,d8,d0[0]                 @vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
    vqmovun.s32 d4,q2                       @vqmovun_s32(sto_res_tmp1): saturate s32 -> u16

    vadd.i32    q5,q5,q15                   @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
    vmov.s32    d5,d4                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2): fill upper half of q2 for the narrowing below

    vshl.s32    q3,q3,q14                   @vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration

    vmull.s16   q6,d9,d0[0]                 @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
    vqmovn.u16  d4,q2                       @vqmovn_u16(sto_res_tmp3): saturate u16 -> u8

    vshl.s32    q5,q5,q14                   @vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
    vqmovun.s32 d6,q3                       @vqmovun_s32(sto_res_tmp1) ii iteration

    vadd.i32    q6,q6,q15                   @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
    vmov.s32    d7,d6                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    vqmovun.s32 d10,q5                      @vqmovun_s32(sto_res_tmp1) iii iteration

    vshl.s32    q6,q6,q14                   @vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
    vst1.32     {d4[0]},[r1]!               @store 4 bytes of pu1_dst i iteration and advance pu1_dst by 4
    vmov.s32    d11,d10                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration

    vqmovn.u16  d6,q3                       @vqmovn_u16(sto_res_tmp3) ii iteration
    vst1.32     {d6[0]},[r6],r3             @store pu1_dst ii iteration

    vqmovn.u16  d10,q5                      @vqmovn_u16(sto_res_tmp3) iii iteration
    vqmovun.s32 d12,q6                      @vqmovun_s32(sto_res_tmp1) iv iteration

    vmov.s32    d13,d12                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
    vst1.32     {d10[0]},[r6],r3            @store pu1_dst iii iteration
    vqmovn.u16  d12,q6                      @vqmovn_u16(sto_res_tmp3) iv iteration

    subs        r9,r9,#4                    @decrement wd by 4 and check for 0
    vst1.32     {d12[0]},[r6],r3            @store pu1_dst iv iteration (does not alter the flags set by subs)
    bgt         core_loop                   @if greater than 0 repeat the core loop again

    @ A band of 4 rows is finished: rewind both pointers to the first column
    @ and move them 4 rows down. None of the instructions between the subs on r8
    @ and the final bgt update the flags.
end_core_loop:
    rsb         r11,r4,r2,lsl #2            @(4 source rows in bytes = r2 << 2) - 2*wd bytes
    subs        r8,r8,#4                    @decrement the ht by 4
    add         r0,r0,r11                   @pi2_src + 4*src_strd - wd, in bytes: 8*src_strd - 2*wd (pi2_src is a 16 bit pointer)
    @restore the column counter: r9 = (2*wd) >> 1 = wd
    asr         r9,r4,#1
    rsb         r12,r9,r3,lsl #2            @4*dst_strd - wd
    add         r1,r1,r12                   @pu1_dst + 4*dst_strd - wd
    bgt         core_loop                   @if ht is greater than 0 process the next 4 rows (branches straight to core_loop, skipping the wd check at outer_loop)

end_loops:
    @restore d8-d15 saved in the prologue
    vpop        {d8  -  d15}
    ldmfd       sp!,{r4-r12,r15}            @restore r4-r12 and return by loading the saved lr into pc
initial 2024-06-22 08:45:49 -04:00			`@/*****************************************************************************`
			`@*`
			`@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore`
			`@*`
			`@* Licensed under the Apache License, Version 2.0 (the "License");`
			`@* you may not use this file except in compliance with the License.`
			`@* You may obtain a copy of the License at:`
			`@*`
			`@* http://www.apache.org/licenses/LICENSE-2.0`
			`@*`
			`@* Unless required by applicable law or agreed to in writing, software`
			`@* distributed under the License is distributed on an "AS IS" BASIS,`
			`@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`@* See the License for the specific language governing permissions and`
			`@* limitations under the License.`
			`@*`
			`@*****************************************************************************/`
			`@/**`
			`@*******************************************************************************`
			`@* @file`
			`@* ihevc_weighted_pred_uni.s`
			`@*`
			`@* @brief`
			`@* contains function definitions for weighted prediction used in inter`
			`@* prediction`
			`@*`
			`@* @author`
			`@* parthiban v`
			`@*`
			`@* @par list of functions:`
			`@* - ihevc_weighted_pred_uni()`
			`@*`
			`@* @remarks`
			`@* none`
			`@*`
			`@*******************************************************************************`
			`@*/`

			`@/**`
			`@*******************************************************************************`
			`@*`
			`@* @brief`
			`@* does uni-weighted prediction on the array pointed by pi2_src and stores`
			`@* it at the location pointed by pi2_dst assumptions : the function is`
			`@* optimized considering the fact width and height are multiple of 2.`
			`@*`
			`@* @par description:`
			`@* dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +`
			`@* offset`
			`@*`
			`@* @param[in] pi2_src`
			`@* pointer to the source`
			`@*`
			`@* @param[out] pu1_dst`
			`@* pointer to the destination`
			`@*`
			`@* @param[in] src_strd`
			`@* source stride`
			`@*`
			`@* @param[in] dst_strd`
			`@* destination stride`
			`@*`
			`@* @param[in] wgt0`
			`@* weight to be multiplied to the source`
			`@*`
			`@* @param[in] off0`
			`@* offset to be added after rounding and`
			`@*`
			`@* @param[in] shifting`
			`@*`
			`@*`
			`@* @param[in] shift`
			`@* (14 bit depth) + log2_weight_denominator`
			`@*`
			`@* @param[in] lvl_shift`
			`@* added before shift and offset`
			`@*`
			`@* @param[in] ht`
			`@* height of the source`
			`@*`
			`@* @param[in] wd`
			`@* width of the source`
			`@*`
			`@* @returns`
			`@*`
			`@* @remarks`
			`@* none`
			`@*`
			`@*******************************************************************************`
			`@*/`

			`@void ihevc_weighted_pred_uni(word16 *pi2_src,`
			`@ uword8 *pu1_dst,`
			`@ word32 src_strd,`
			`@ word32 dst_strd,`
			`@ word32 wgt0,`
			`@ word32 off0,`
			`@ word32 shift,`
			`@ word32 lvl_shift,`
			`@ word32 ht,`
			`@ word32 wd)`

			`@************variables vs registers***************************************`
			`@ r0 => *pi2_src`
			`@ r1 => *pu1_dst`
			`@ r2 => src_strd`
			`@ r3 => dst_strd`
			`@ r4 => wgt0`
			`@ r5 => off0`
			`@ r6 => shift`
			`@ r7 => lvl_shift`
			`@ r8 => ht`
			`@ r9 => wd`

			`.equ wgt0_offset, 104`
			`.equ off0_offset, 108`
			`.equ shift_offset, 112`
			`.equ lvl_shift_offset, 116`
			`.equ ht_offset, 120`
			`.equ wd_offset, 124`

			`.text`
			`.align 4`




			`.globl ihevc_weighted_pred_uni_a9q`

			`.type ihevc_weighted_pred_uni_a9q, %function`

			`ihevc_weighted_pred_uni_a9q:`

			`stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments`
			`vpush {d8 - d15}`

			`ldr r4,[sp,#wgt0_offset] @load wgt0`
			`ldr r7,[sp,#lvl_shift_offset] @load lvl_shift`
			`mov r11,#1`
			`ldr r5,[sp,#off0_offset] @load off0`
			`mul r10,r7,r4 @lvl_shift * wgt0`
			`ldr r6,[sp,#shift_offset] @load shift`
			`ldr r8,[sp,#ht_offset] @load ht`
			`add r10,r10,r5,lsl r6 @lvl_shift * wgt0 + (off0 << shift)`
			`ldr r9,[sp,#wd_offset] @load wt`
			`sub r12,r6,#1`
			`vmov.s16 d0[0],r4 @moved for scalar multiplication`
			`lsl r2,r2,#1`
			`vdup.u32 q14,r6 @vmovq_n_s32(tmp_shift)`
			`add r10,r10,r11,lsl r12 @tmp_lvl_shift += (1 << (shift - 1))`
			`vdup.s32 q15,r10 @vmovq_n_s32(tmp_lvl_shift)`
			`vneg.s32 q14,q14`
			`lsl r4,r9,#1`

			`cmp r8,#0 @check ht == 0`
			`beq end_loops @if equal, then end the function`

			`outer_loop:`
			`cmp r9,#0 @check wd == 0`
			`beq end_loops @if equal, then end the function`

			`core_loop:`
			`add r5,r0,r2 @pi2_src_tmp1 = pi2_src1 + 2src_strd1(2 because pi1_src is a 16 bit pointer)`
			`add r6,r1,r3 @pu1_dst_tmp = pu1_dst + dst_strd`
			`vld1.s16 {d1},[r0]! @load and increment the pi2_src`
			`vld1.s16 {d2},[r5],r2 @load and increment the pi2_src_tmp ii iteration`
			`vmull.s16 q2,d1,d0[0] @vmull_n_s16(pi2_src_val1, (int16_t) wgt0)`

			`vadd.i32 q2,q2,q15 @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)`
			`vld1.s16 {d8},[r5],r2 @load and increment the pi2_src iii iteration`

			`vmull.s16 q3,d2,d0[0] @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration`
			`vld1.s16 {d9},[r5],r2 @load and increment the pi2_src_tmp iv iteration`

			`vshl.s32 q2,q2,q14 @vshlq_s32(i4_tmp1_t, tmp_shift_t)`
			`vadd.i32 q3,q3,q15 @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration`

			`vmull.s16 q5,d8,d0[0] @vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration`
			`vqmovun.s32 d4,q2 @vqmovun_s32(sto_res_tmp1)`

			`vadd.i32 q5,q5,q15 @vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration`
			`vmov.s32 d5,d4 @vcombine_u16(sto_res_tmp2, sto_res_tmp2)`

			`vshl.s32 q3,q3,q14 @vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration`

			`vmull.s16 q6,d9,d0[0] @vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration`
			`vqmovn.u16 d4,q2 @vqmovn_u16(sto_res_tmp3)`

			`vshl.s32 q5,q5,q14 @vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration`
			`vqmovun.s32 d6,q3 @vqmovun_s32(sto_res_tmp1) ii iteration`

			`vadd.i32 q6,q6,q15 @vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration`
			`vmov.s32 d7,d6 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration`

			`vqmovun.s32 d10,q5 @vqmovun_s32(sto_res_tmp1) iii iteration`

			`vshl.s32 q6,q6,q14 @vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration`
			`vst1.32 {d4[0]},[r1]! @store pu1_dst i iteration`
			`vmov.s32 d11,d10 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration`

			`vqmovn.u16 d6,q3 @vqmovn_u16(sto_res_tmp3) ii iteration`
			`vst1.32 {d6[0]},[r6],r3 @store pu1_dst ii iteration`

			`vqmovn.u16 d10,q5 @vqmovn_u16(sto_res_tmp3) iii iteration`
			`vqmovun.s32 d12,q6 @vqmovun_s32(sto_res_tmp1) iv iteration`

			`vmov.s32 d13,d12 @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration`
			`vst1.32 {d10[0]},[r6],r3 @store pu1_dst i iteration iii iteration`
			`vqmovn.u16 d12,q6 @vqmovn_u16(sto_res_tmp3) iv iteration`

			`subs r9,r9,#4 @decrement wd by 4 and check for 0`
			`vst1.32 {d12[0]},[r6],r3 @store pu1_dst iv iteration`
			`bgt core_loop @if greater than 0 repeat the core loop again`

			`end_core_loop:`
			`rsb r11,r4,r2,lsl #2 @2*src_strd - wd`
			`subs r8,r8,#4 @decrement the ht by 4`
			`add r0,r0,r11 @pi2_src + 4src_strd - 2wd(since pi2_src is 16 bit pointer double the increment with double the wd decrement)`
			`asr r9,r4,#1`
			`rsb r12,r9,r3,lsl #2 @2*dst_strd - wd`
			`add r1,r1,r12 @pu1_dst + dst_std - wd`
			`bgt core_loop @if ht is greater than 0 goto outer_loop`

			`end_loops:`
			`vpop {d8 - d15}`
			`ldmfd sp!,{r4-r12,r15} @reload the registers from sp`