221 lines
5.3 KiB
ArmAsm
221 lines
5.3 KiB
ArmAsm
///*****************************************************************************
|
|
//*
|
|
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************/
|
|
///**
///*******************************************************************************
//* @file
//*  ihevcd_itrans_recon_dc_chroma.s
//*
//* @brief
//*  Contains the function definition for inverse transform and reconstruction
//*  of chroma in the DC-only case
//*
//* @author
//*  ittiam
//*
//* @par List of Functions:
//*  - ihevcd_itrans_recon_dc_chroma_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************/
|
|
|
|
|
|
.text
|
|
.include "ihevc_neon_macros.s"
|
|
|
|
|
|
.globl ihevcd_itrans_recon_dc_chroma_av8

.type ihevcd_itrans_recon_dc_chroma_av8, %function

//------------------------------------------------------------------------------
// void ihevcd_itrans_recon_dc_chroma(uword8 *pu1_pred,
//                                    uword8 *pu1_dst,
//                                    word32 pred_strd,
//                                    word32 dst_strd,
//                                    word32 log2_trans_size,
//                                    word16 i2_coeff_value)
//
// Purpose:
//   DC-only inverse transform + reconstruction on a byte-interleaved buffer.
//   A single dc value is derived from i2_coeff_value:
//       t  = clip_s16((i2_coeff_value * 64 + 64)   >> 7)
//       dc = clip_s16((t              * 64 + 2048) >> 12)
//   dc is added to the even-indexed bytes of every row (bytes 0, 2, 4, ...
//   relative to pu1_pred) and the sum is saturated to 0..255. The odd-indexed
//   bytes are loaded from pu1_pred and written to pu1_dst unmodified.
//
// ABI: AAPCS64
// In:
//   x0 = pu1_pred
//   x1 = pu1_dst
//   w2 = pred_strd       (32-bit, sign-extended to x2 below)
//   w3 = dst_strd        (32-bit, sign-extended to x3 below)
//   w4 = log2_trans_size (only the low 6 bits are consumed by the shift)
//   w5 = i2_coeff_value  (16-bit, sign-extended to x5 below)
// Out:
//   none (writes to pu1_dst)
// Clobbers:
//   x0-x11, flags, v0, v2-v18, v20, v22, v24-v31.
//   x19/x20 are saved and restored here. push_v_regs / pop_v_regs come from
//   ihevc_neon_macros.s -- presumably they preserve the callee-saved SIMD
//   registers (v8-v15 are used below); confirm against the macro file.
//
// NOTE(review): trans_size == 4 takes a dedicated path; every other size runs
//   the 8x8-tile loop, so trans_size is assumed to be 4 or a multiple of 8.
//------------------------------------------------------------------------------
ihevcd_itrans_recon_dc_chroma_av8:

    push_v_regs
    stp         x19, x20, [sp, #-16]!

    // 32-bit (and narrower) arguments arrive with unspecified upper register
    // bits under AAPCS64. The strides are used as 64-bit post-index
    // increments, so widen them before first use.
    sxtw        x2, w2                      // pred_strd
    sxtw        x3, w3                      // dst_strd
    sxth        x5, w5                      // i2_coeff_value is word16

    mov         x10, #1
    lsl         x4, x10, x4                 // trans_size = (1 << log2_trans_size)
    mov         x6, #64                     // 1 << (shift1 - 1)
    mov         x7, #2048                   // 1 << (shift2 - 1)

    // Stage 1: x8 = clip_s16((coeff * 64 + 64) >> 7)
    add         x8, x6, x5, lsl #6
    asr         x20, x8, #7
    mov         x19, #32767
    cmp         x20, x19
    blt         lbl36
    mov         x8, #32767                  // saturate high
    b           lbl36_1
lbl36:
    mov         x19, #-32768
    cmp         x20, x19
    csel        x8, x19, x20, lt            // saturate low, else keep value
lbl36_1:

    // Stage 2: x6 = dc_value = clip_s16((x8 * 64 + 2048) >> 12)
    add         x5, x7, x8, lsl #6
    asr         x20, x5, #12
    mov         x19, #32767
    cmp         x20, x19
    blt         lbl38
    mov         x6, #32767                  // saturate high
    b           lbl38_1
lbl38:
    mov         x19, #-32768
    cmp         x20, x19
    csel        x6, x19, x20, lt            // saturate low, else keep value
lbl38_1:

    mov         x8, x4                      // x8 = rows remaining (8x8-tile path)

    // x6 has the dc_value
    // x4 has the trans_size value
    // x8 has the row value
    // x9 has the col value (reloaded at the top of every tile row)
    dup         v0.8h, w6                   // broadcast dc_value to 8 x 16-bit lanes
    cmp         x4, #4
    beq         row_loop_4chroma


row_loop_chroma:
    mov         x9, x4                      // x9 = columns remaining in this tile row


col_loop_chroma:

    // Load one 8-row tile. ld2 de-interleaves 16 bytes per row: even bytes go
    // to the first register of each pair, odd bytes to the second.
    mov         x7, x0
    ld2         {v2.8b, v3.8b}, [x7], x2
    ld2         {v4.8b, v5.8b}, [x7], x2
    ld2         {v6.8b, v7.8b}, [x7], x2
    ld2         {v8.8b, v9.8b}, [x7], x2

    ld2         {v10.8b, v11.8b}, [x7], x2
    ld2         {v12.8b, v13.8b}, [x7], x2
    ld2         {v14.8b, v15.8b}, [x7], x2
    ld2         {v16.8b, v17.8b}, [x7]

    add         x0, x0, #16                 // next tile to the right in pred


    // dc_value + zero-extended even bytes, kept as 16-bit lanes
    uaddw       v30.8h, v0.8h, v2.8b
    uaddw       v28.8h, v0.8h, v4.8b
    uaddw       v26.8h, v0.8h, v6.8b
    uaddw       v24.8h, v0.8h, v8.8b
    uaddw       v22.8h, v0.8h, v10.8b
    uaddw       v20.8h, v0.8h, v12.8b
    uaddw       v18.8h, v0.8h, v14.8b


    // Narrow with signed->unsigned saturation, i.e. clip to 0..255
    mov         x11, x1
    sqxtun      v2.8b, v30.8h
    sqxtun      v4.8b, v28.8h
    sqxtun      v6.8b, v26.8h
    sqxtun      v8.8b, v24.8h

    uaddw       v30.8h, v0.8h, v16.8b       // last row; v30 is free again here

    sqxtun      v10.8b, v22.8h
    sqxtun      v12.8b, v20.8h
    sqxtun      v14.8b, v18.8h
    sqxtun      v16.8b, v30.8h

    // Re-interleave: updated even bytes with the untouched odd bytes from pred
    st2         {v2.8b, v3.8b}, [x11], x3
    st2         {v4.8b, v5.8b}, [x11], x3
    st2         {v6.8b, v7.8b}, [x11], x3
    st2         {v8.8b, v9.8b}, [x11], x3

    st2         {v10.8b, v11.8b}, [x11], x3
    st2         {v12.8b, v13.8b}, [x11], x3
    st2         {v14.8b, v15.8b}, [x11], x3
    st2         {v16.8b, v17.8b}, [x11]

    add         x1, x1, #16                 // next tile to the right in dst

    subs        x9, x9, #8
    bgt         col_loop_chroma

    subs        x8, x8, #8                  // flags consumed by the bgt below;
                                            // the add/sub in between do not set flags

    // Move both pointers down 8 rows and back to the first column
    // (the column loop advanced them by 2 * trans_size bytes).
    add         x0, x0, x2, lsl #3
    add         x1, x1, x3, lsl #3
    sub         x0, x0, x4, lsl #1
    sub         x1, x1, x4, lsl #1
    bgt         row_loop_chroma
    b           end_loops_chroma


row_loop_4chroma:


col_loop_4chroma:

    // trans_size == 4: a single pass over four rows, no loop counters needed.
    // NOTE(review): each ld2 reads 16 bytes per row, but only the low four
    // lanes of each register are consumed by zip1 below (8 bytes are stored).
    ld2         {v2.8b, v3.8b}, [x0], x2
    ld2         {v4.8b, v5.8b}, [x0], x2
    ld2         {v6.8b, v7.8b}, [x0], x2
    ld2         {v8.8b, v9.8b}, [x0]


    // dc_value + zero-extended even bytes
    uaddw       v30.8h, v0.8h, v2.8b
    uaddw       v28.8h, v0.8h, v4.8b
    uaddw       v26.8h, v0.8h, v6.8b
    uaddw       v24.8h, v0.8h, v8.8b


    // Clip to 0..255
    sqxtun      v31.8b, v30.8h
    sqxtun      v29.8b, v28.8h
    sqxtun      v27.8b, v26.8h
    sqxtun      v25.8b, v24.8h


    // Interleave the low four updated even bytes with the low four odd bytes
    zip1        v2.8b, v31.8b, v3.8b
    zip1        v4.8b, v29.8b, v5.8b
    zip1        v6.8b, v27.8b, v7.8b
    zip1        v8.8b, v25.8b, v9.8b

    st1         {v2.2s}, [x1], x3
    st1         {v4.2s}, [x1], x3
    st1         {v6.2s}, [x1], x3
    st1         {v8.2s}, [x1]

end_loops_chroma:
    // Restore in the reverse order of the prologue
    ldp         x19, x20, [sp], #16
    pop_v_regs
    ret
|
|
|
|
|