520 lines
15 KiB
ArmAsm
520 lines
15 KiB
ArmAsm
///*****************************************************************************
|
|
//*
|
|
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************/
|
|
///**
|
|
//*******************************************************************************
|
|
//* @file
|
|
//* ihevc_intra_pred_filters_dc.s
|
|
//*
|
|
//* @brief
|
|
//* contains function definitions for intra prediction dc filtering.
|
|
//* functions are coded using neon intrinsics and can be compiled using
|
|
|
|
//* rvct
|
|
//*
|
|
//* @author
|
|
//* akshaya mukund
|
|
//*
|
|
//* @par list of functions:
|
|
//*
|
|
//*
|
|
//* @remarks
|
|
//* none
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
///**
|
|
//*******************************************************************************
|
|
//*
|
|
//* @brief
|
|
//* luma intraprediction filter for dc input
|
|
//*
|
|
//* @par description:
|
|
//*
|
|
//* @param[in] pu1_ref
|
|
//* uword8 pointer to the source
|
|
//*
|
|
//* @param[out] pu1_dst
|
|
//* uword8 pointer to the destination
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] dst_strd
|
|
//* integer destination stride
|
|
//*
|
|
//* @param[in] pi1_coeff
|
|
//* word8 pointer to the planar coefficients
|
|
//*
|
|
//* @param[in] nt
|
|
//* size of transform block
|
|
//*
|
|
//* @param[in] mode
|
|
//* type of filtering
|
|
//*
|
|
//* @returns
|
|
//*
|
|
//* @remarks
|
|
//* none
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
|
|
//void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
|
|
// word32 src_strd,
|
|
// uword8 *pu1_dst,
|
|
// word32 dst_strd,
|
|
// word32 nt,
|
|
// word32 mode)
|
|
//
|
|
//**************variables vs registers*****************************************
|
|
//x0 => *pu1_ref
|
|
//x1 => src_strd
|
|
//x2 => *pu1_dst
|
|
//x3 => dst_strd
|
|
|
|
//x4 => nt
|
|
//x5 => mode (overwritten before it is ever read)
|
|
//(pi1_coeff is not an argument of this function; see the prototype above)
|
|
|
|
// Everything that follows is emitted into the code section.
.text
|
|
// NOTE(review): GNU as treats the .align operand as a power of two on
// AArch64, so this presumably requests 16-byte alignment - confirm against
// the assembler actually used to build this file.
.align 4
|
|
// Textually includes another assembly source; judging by its name it
// presumably holds shared NEON helper macros. No macro is invoked by the
// code visible in this file.
.include "ihevc_neon_macros.s"
|
|
|
|
|
|
.globl ihevc_intra_pred_luma_dc_av8

.type ihevc_intra_pred_luma_dc_av8, %function

//-----------------------------------------------------------------------------
// ihevc_intra_pred_luma_dc_av8
//
// DC intra prediction for one nt x nt luma block, with smoothing of the first
// row and first column for nt < 32.
//
// Syntax : GNU as, AArch64 (A64 + Advanced SIMD)
//
// In     : x0 = pu1_ref   reference samples ("src" in the comments below)
//          x1 = src_strd  never read
//          x2 = pu1_dst   destination block
//          w3 = dst_strd  destination stride in bytes (word32)
//          w4 = nt        block size (word32)
//          w5 = mode      never read (x5 is reused as scratch)
// Out    : nothing in registers; nt x nt bytes are written through pu1_dst
//
// Computation
//   dc = (sum(src[nt .. 2nt-1]) + sum(src[2nt+1 .. 3nt]) + nt) >> (log2(nt)+1)
//   nt == 32      : every output byte = dc
//   nt == 4/8/16  : dst[0][0] = (src[2nt-1] + src[2nt+1] + 2*dc + 2) >> 2
//                   dst[0][c] = (src[2nt+1+c] + 3*dc + 2) >> 2     (c > 0)
//                   dst[r][0] = (src[2nt-1-r] + 3*dc + 2) >> 2     (r > 0)
//                   all other bytes = dc
//
// NOTE(review): only nt == 4 and nt == 32 are special-cased; every other
// value goes through code that works in groups of 8 and covers at most two
// such groups, so nt is presumably restricted to 4, 8, 16 or 32 - confirm
// against the callers.
//
// The loads are 8 bytes wide and software pipelined, so bytes up to
// src[3nt+8] (src[3nt+4] for nt == 4) are read even though not all of them
// contribute to the result.
//
// Clobbers: x0, x2-x12, x14, v0-v7, v16-v30, condition flags.
//           x19/x20 are saved and restored (only x20 is actually written).
//-----------------------------------------------------------------------------
ihevc_intra_pred_luma_dc_av8:

    stp         x19, x20, [sp, #-16]!       // x20 is scratch below; pushing a pair keeps sp 16-byte aligned

    // dst_strd and nt are 32-bit arguments. AAPCS64 leaves bits [63:32] of
    // their registers unspecified, yet both are used as 64-bit operands for
    // address arithmetic below, so widen them once up front.
    sxtw        x3, w3                      // dst_strd (signed)
    sxtw        x4, w4                      // nt

    mov         x11, #2                     // rounding constant 2
    mov         x9, #0
    mov         v17.s[0], w11
    mov         v17.s[1], w9                // d17 = 2 as a 64-bit value

    clz         w5, w4                      // leading zeros of nt

    add         x6, x0, x4                  // &src[nt]
    sub         x20, x5, #32
    neg         x5, x20                     // 32 - clz(nt) = log2(nt) + 1
    add         x7, x0, x4, lsl #1          // &src[2nt]

    add         x8, x7, #1                  // &src[2nt+1]
    mvn         x5, x5
    add         x5, x5, #1                  // two's complement: -(log2(nt) + 1)
    dup         v7.2s, w5

    ldrb        w14, [x8]                   // src[2nt+1]
    sxtw        x14, w14
    shl         d7, d7, #32

    sub         x9, x7, #1                  // &src[2nt-1]
    sshr        d7, d7, #32                 // d7 = negative shift count, sign-extended to 64 bits

    mov         x7, x8                      // x7 keeps &src[2nt+1] for later

    ldrb        w12, [x9]                   // src[2nt-1]
    sxtw        x12, w12
    add         x14, x14, x12               // src[2nt+1] + src[2nt-1]
    add         x14, x14, x11               // src[2nt+1] + src[2nt-1] + 2

    cmp         x4, #4
    beq         dc_4                        // 4x4 has its own straight-line path

    mov         x10, x4                     // x10 = samples left to sum per side

// First group of 8 samples from each side; also seeds the accumulator with nt
// (the rounding term) and preloads the next group.
add_loop:
    ld1         {v0.8b}, [x6], #8           // src[nt ...]
    mov         x5, #0
    ld1         {v1.8b}, [x8], #8           // src[2nt+1 ...]

    uaddlp      v2.4h, v0.8b                // pairwise widen 8 x u8 -> 4 x u16

    mov         v6.s[0], w4
    mov         v6.s[1], w5                 // d6 = nt
    uaddlp      v3.4h, v1.8b

    ld1         {v0.8b}, [x6], #8           // preload next group from src[nt ...]

    ld1         {v1.8b}, [x8], #8           // preload next group from src[2nt+1 ...]
    add         v4.4h, v2.4h, v3.4h         // combine both sides

    uaddlp      v5.2s, v4.4h                // 4 x u16 -> 2 x u32

    uadalp      v6.1d, v5.2s                // d6 += both u32 lanes

    subs        x10, x10, #8
    beq         epil_add_loop               // nt == 8: summation finished

// Remaining groups of 8; the loads at the bottom fetch one group ahead.
core_loop_add:
    uaddlp      v2.4h, v0.8b
    subs        x10, x10, #8
    uaddlp      v3.4h, v1.8b

    add         v4.4h, v2.4h, v3.4h
    ld1         {v0.8b}, [x6], #8           // preload next group from src[nt ...]

    uaddlp      v5.2s, v4.4h
    ld1         {v1.8b}, [x8], #8           // preload next group from src[2nt+1 ...]

    uadalp      v6.1d, v5.2s                // d6 += both u32 lanes
    bne         core_loop_add

epil_add_loop:

    sshl        d18, d6, d7                 // dc = sum >> (log2(nt) + 1) (negative count shifts right)
    cmp         x4, #32                     // flags stay live until the csel x10 below

    mov         v28.s[0], w14
    mov         v28.s[1], w5                // d28 = src[2nt+1] + src[2nt-1] + 2 (x5 is 0 here)
    mov         x20, #128
    csel        x6, x20, x6, eq             // x6 is not read again before being rewritten

    dup         v16.8b, v18.b[0]            // dc in every byte lane
    shl         d25, d18, #1                // 2*dc

    beq         prologue_cpy_32             // nt == 32: plain fill, no edge smoothing

    add         d27, d25, d28               // src[2nt+1] + src[2nt-1] + 2*dc + 2
    mov         x20, #0
    csel        x6, x20, x6, ne

    ushr        v29.4h, v27.4h, #2          // dst[0][0] in lane 0 of v29
    csel        x10, x4, x10, ne            // x10 = nt (row/column counter)

    add         d23, d25, d18               // 3*dc
    sub         x12, x3, x3, lsl #3         // -7*strd

    add         d23, d23, d17               // 3*dc + 2
    add         x12, x12, #8                // -7*strd + 8: from row 7 back to row 0 of the next 8 columns

    dup         v24.8h, v23.h[0]            // 3*dc + 2 in every halfword lane
    sub         x0, x3, x4                  // strd - nt

// Rows 0-7 of columns 0-7. v2 = smoothed first row, v3 = smoothed first-column
// values (one byte per row, consumed from the low byte upwards).
prologue_col:
    mov         x8, x7                      // &src[2nt+1]

    add         x0, x0, #8                  // strd - nt + 8
    ld1         {v0.8b}, [x8], #8           // src[2nt+1 .. 2nt+8]  (columns 0-7)
    sub         x9, x9, #7                  // &src[2nt-8]

    ld1         {v1.8b}, [x9]               // src[2nt-8 .. 2nt-1]  (rows 7..0)
    sub         x9, x9, #8                  // &src[2nt-16]

    uxtl        v20.8h, v0.8b

    ld1         {v6.8b}, [x8]               // src[2nt+9 .. 2nt+16] (columns 8-15)
    add         v20.8h, v20.8h, v24.8h      // columns 0-7 + 3*dc + 2

    uxtl        v22.8h, v1.8b
    sqshrun     v2.8b, v20.8h, #2           // >> 2, narrow to u8

    uxtl        v26.8h, v6.8b
    add         v22.8h, v22.8h, v24.8h      // rows 7..0 + 3*dc + 2

    movi        d19, #0x00000000000000ff    // select mask: low byte only
    sqshrun     v3.8b, v22.8h, #2           // >> 2, narrow to u8

    bsl         v19.8b, v29.8b, v2.8b       // row 0: byte 0 = dst[0][0], bytes 1-7 = smoothed columns
    add         v26.8h, v26.8h, v24.8h      // columns 8-15 + 3*dc + 2

    rev64       v3.8b, v3.8b                // reverse so byte 0 belongs to row 0

    st1         {v19.8b}, [x2], x3          // store row 0
    sshr        d3, d3, #8                  // drop the row-0 byte; byte 0 now belongs to row 1

    movi        d20, #0x00000000000000ff    // mask for row 1

// Seven rows whose byte 0 comes from v3 and whose bytes 1-7 are dc. Entered
// twice for nt == 16: rows 1-7 first, then rows 9-15.
loop_again_col_row:

    bsl         v20.8b, v3.8b, v16.8b       // 1st row of this pass

    movi        d21, #0x00000000000000ff
    sshr        d3, d3, #8

    st1         {v20.8b}, [x2], x3
    sqshrun     v4.8b, v26.8h, #2           // smoothed row 0 for columns 8-15

    bsl         v21.8b, v3.8b, v16.8b       // 2nd row

    movi        d20, #0x00000000000000ff
    sshr        d3, d3, #8

    st1         {v21.8b}, [x2], x3

    bsl         v20.8b, v3.8b, v16.8b       // 3rd row

    movi        d21, #0x00000000000000ff
    sshr        d3, d3, #8

    st1         {v20.8b}, [x2], x3

    bsl         v21.8b, v3.8b, v16.8b       // 4th row

    movi        d20, #0x00000000000000ff
    sshr        d3, d3, #8

    st1         {v21.8b}, [x2], x3

    bsl         v20.8b, v3.8b, v16.8b       // 5th row

    movi        d21, #0x00000000000000ff
    sshr        d3, d3, #8

    st1         {v20.8b}, [x2], x3

    ld1         {v1.8b}, [x9]               // src[2nt-16 .. 2nt-9] (rows 15..8)

    bsl         v21.8b, v3.8b, v16.8b       // 6th row

    uxtl        v22.8h, v1.8b

    movi        d20, #0x00000000000000ff
    sshr        d3, d3, #8

    st1         {v21.8b}, [x2], x3

    bsl         v20.8b, v3.8b, v16.8b       // 7th row
    add         v22.8h, v22.8h, v24.8h      // rows 15..8 + 3*dc + 2

    sshr        d3, d3, #8
    st1         {v20.8b}, [x2], x12         // store, then jump up 7 rows and right 8 columns

    subs        x10, x10, #8

    beq         end_func                    // nt == 8: block complete
    blt         copy_16                     // second pass of nt == 16 done: only the last 8x8 fill remains

    // nt == 16, first pass finished: write rows 0-7 of columns 8-15, then
    // start rows 8-15 of columns 0-7.
    movi        d20, #0x00000000000000ff    // mask for row 8
    sqshrun     v3.8b, v22.8h, #2           // smoothed first-column values for rows 15..8

    rev64       v3.8b, v3.8b                // byte 0 now belongs to row 8

    st1         {v4.8b}, [x2], x3           // row 0, columns 8-15 (smoothed)

    st1         {v16.8b}, [x2], x3          // rows 1-7, columns 8-15: plain dc
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x0          // + (strd - 8): on to row 8, column 0

    bsl         v20.8b, v3.8b, v16.8b       // row 8
    subs        x10, x10, #8

    st1         {v20.8b}, [x2], x3          // store row 8
    sshr        d3, d3, #8                  // byte 0 now belongs to row 9

    movi        d20, #0x00000000000000ff    // mask for row 9

    b           loop_again_col_row

// nt == 16: rows 8-15 of columns 8-15 are plain dc.
copy_16:
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2], x3
    st1         {v16.8b}, [x2]

    b           end_func

// nt == 32: fill the whole 32x32 block with dc. Four row pointers advance in
// lockstep; each group of eight stores writes four complete 32-byte rows.
// x9 counts 128 down in steps of 32, one step per group (the kernel loop
// issues two groups per step).
prologue_cpy_32:
    mov         x9, #128
    add         x5, x2, x3                  // row 1
    add         x8, x5, x3                  // row 2
    add         x10, x8, x3                 // row 3
    dup         v20.16b, v16.b[0]           // dc in all 16 byte lanes
    lsl         x6, x3, #2
    sub         x6, x6, #16                 // 4*strd - 16: from column 16 to column 0 four rows down

    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    sub         x9, x9, #32

kernel_copy:
    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    subs        x9, x9, #32                 // flags consumed by the bne below (st1 leaves them alone)

    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2], x6
    st1         {v20.16b}, [x5], x6
    st1         {v20.16b}, [x8], x6
    st1         {v20.16b}, [x10], x6

    bne         kernel_copy

epilogue_copy:
    st1         {v20.16b}, [x2], #16
    st1         {v20.16b}, [x5], #16
    st1         {v20.16b}, [x8], #16
    st1         {v20.16b}, [x10], #16

    st1         {v20.16b}, [x2]
    st1         {v20.16b}, [x5]
    st1         {v20.16b}, [x8]
    st1         {v20.16b}, [x10]

    b           end_func

// nt == 4: same arithmetic as above, done once without loops. The loads are
// still 8 bytes wide; only the low half of each result is used.
dc_4:
    ld1         {v0.8b}, [x6], #8           // src[4 .. 11]
    ld1         {v1.8b}, [x8], #8           // src[9 .. 16]

    uaddlp      v2.4h, v0.8b
    mov         x5, #0
    mov         v6.s[0], w4
    mov         v6.s[1], w5                 // d6 = nt
    uaddlp      v3.4h, v1.8b

    add         v4.4h, v2.4h, v3.4h

    uaddlp      v5.2s, v4.4h                // lane 0 = sum of the first 4 samples of each side
    movi        d30, #0x00000000ffffffff

    and         v5.8b, v5.8b, v30.8b        // discard lane 1 (samples beyond the 4 wanted)

    mov         v28.s[0], w14
    mov         v28.s[1], w5                // d28 = src[2nt+1] + src[2nt-1] + 2
    add         d6, d6, d5                  // sum + nt

    sshl        d18, d6, d7                 // dc = (sum + nt) >> (log2(nt) + 1)
    mov         x8, x7                      // &src[2nt+1]

    shl         d25, d18, #1                // 2*dc
    sub         x9, x9, #3                  // &src[2nt-4]

    dup         v16.8b, v18.b[0]            // dc in every byte lane
    add         d27, d25, d28               // src[2nt+1] + src[2nt-1] + 2*dc + 2

    ushr        v29.4h, v27.4h, #2          // dst[0][0] in lane 0 of v29
    sub         x12, x3, x3, lsl #2         // -3*strd (x12 is not used on this path)
    add         d23, d25, d18               // 3*dc

    add         d23, d23, d17               // 3*dc + 2
    add         x12, x12, #4

    dup         v24.8h, v23.h[0]            // 3*dc + 2 in every halfword lane
    sub         x0, x3, x4                  // strd - nt (x0 is not used on this path)

    ld1         {v0.8b}, [x8]               // src[2nt+1 ...]: columns 0-3 in the low half
    ld1         {v1.8b}, [x9]               // src[2nt-4 ...]: rows 3..0 in the low half

    uxtl        v20.8h, v0.8b

    uxtl        v22.8h, v1.8b
    add         v20.8h, v20.8h, v24.8h      // columns + 3*dc + 2

    add         v22.8h, v22.8h, v24.8h      // rows + 3*dc + 2

    movi        d19, #0x00000000000000ff    // select mask: low byte only
    sqshrun     v2.8b, v20.8h, #2           // >> 2, narrow to u8

    movi        d20, #0x00000000000000ff    // mask for row 1
    sqshrun     v3.8b, v22.8h, #2           // >> 2, narrow to u8

    bsl         v19.8b, v29.8b, v2.8b       // row 0: byte 0 = dst[0][0], rest = smoothed columns

    rev64       v3.8b, v3.8b                // rows 0..3 now sit in bytes 4..7

    st1         {v19.s}[0], [x2], x3        // store row 0 (4 bytes)
    sshr        d3, d3, #40                 // byte 0 now belongs to row 1

    movi        d21, #0x00000000000000ff    // mask for row 2

    bsl         v20.8b, v3.8b, v16.8b       // row 1
    sshr        d3, d3, #8

    st1         {v20.s}[0], [x2], x3        // store row 1

    bsl         v21.8b, v3.8b, v16.8b       // row 2

    movi        d20, #0x00000000000000ff    // mask for row 3

    sshr        d3, d3, #8
    st1         {v21.s}[0], [x2], x3        // store row 2

    bsl         v20.8b, v3.8b, v16.8b       // row 3
    st1         {v20.s}[0], [x2]            // store row 3

epilogue_end:

end_func:
    ldp         x19, x20, [sp], #16         // restore the pair pushed at entry

    ret
|
|
|
|
|
|
|
|
|
|
|