421 lines
17 KiB
ArmAsm
421 lines
17 KiB
ArmAsm
//******************************************************************************
|
|
//*
|
|
//* Copyright (C) 2015 The Android Open Source Project
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************
|
|
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
//*/
|
|
|
|
///*
|
|
////----------------------------------------------------------------------------
|
|
//// File Name : impeg2_format_conv.s
|
|
////
|
|
//// Description : This file has the YUV420P to YUV420SP format conversion
//// implementations on neon platform.
|
|
////
|
|
//// Reference Document :
|
|
////
|
|
//// Revision History :
|
|
//// Date Author Detail Description
|
|
//// ------------ ---------------- ----------------------------------
|
|
//// Jul 07, 2008 Naveen Kumar T Created
|
|
////
|
|
////-------------------------------------------------------------------------
|
|
//*/
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Include Files
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
//// Named shift amounts: log2(16) = 4 and log2(2) = 1.
//// Neither symbol is referenced by the routines visible in this file.
.equ log2_16, 4
.equ log2_2,  1

//// Everything below is code; the shared macros (push_v_regs / pop_v_regs
//// are the ones used below) come from the included macro file.
.text
.include "impeg2_neon_macros.s"
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Struct/Union Types and Define
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Static Global Data section variables
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
////--------------------------- NONE --------------------------------------------
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Static Prototype Functions
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
//// -------------------------- NONE --------------------------------------------
|
|
|
|
///*
|
|
//// ----------------------------------------------------------------------------
|
|
//// Exported functions
|
|
//// ----------------------------------------------------------------------------
|
|
//*/
|
|
|
|
|
|
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8()
//*
//*  Description      : Converts an image from planar YUV420P to semi-planar
//*                     YUV420SP. The luma plane is copied row by row (unless
//*                     convert_uv_only == 1) and the U and V planes are
//*                     interleaved into one plane, U byte first (U,V,U,V,...).
//*
//*  Arguments        : x0        pu1_y              source luma plane
//*                     x1        pu1_u              source U plane
//*                     x2        pu1_v              source V plane
//*                     x3        pu1_dest_y         destination luma plane
//*                     x4        pu1_dest_uv        destination UV plane
//*                     w5        u2_height          luma rows
//*                     w6        u2_width           luma pixels per row
//*                     w7        u2_stridey         source luma stride
//*                     Stack arguments, offsets valid AFTER the 80-byte
//*                     prologue (push_v_regs + stp x19, x20):
//*                     sp, #80   u2_strideu         source U stride
//*                     sp, #88   u2_stridev         source V stride
//*                     sp, #96   u2_dest_stride_y   destination luma stride
//*                     sp, #104  u2_dest_stride_uv  destination UV stride
//*                     sp, #112  convert_uv_only    1 => skip the luma copy
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x5  row counter
//*                     x6  luma width, later chroma width (width + 1) / 2
//*                     x7  luma source increment, later U source increment
//*                     x8  destination increment (luma, later UV)
//*                     x9  copy of u2_height for the chroma pass
//*                     x10 V source increment
//*                     w14 convert_uv_only
//*                     x16 column counter / tail rewind amount
//*                     x20 scratch (callee-saved, preserved)
//*                     v0, v1 pixel data
//*
//*  Stack Usage      : 80 Bytes. The stack argument offsets above rely on
//*                     push_v_regs (impeg2_neon_macros.s) pushing 64 bytes.
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*  Assumptions      : u2_width must be at least 16. Widths that are not a
//*                     multiple of 16 are handled by stepping the pointers
//*                     back and re-copying the final 16 luma pixels
//*                     (8 U + 8 V samples), which would touch memory before
//*                     the row start for narrower images.
//*                     Both row loops test their counter at the bottom, so
//*                     at least one row is always processed.
//*                     Chroma rows processed = (u2_height + 1) / 2.
//*                     U rows advance by u2_strideu and V rows by u2_stridev.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/
    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8:

    //// Prologue: push_v_regs + x19/x20 = 80 bytes, so the first stack
    //// argument is found at [sp, #80] from here on.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    //// AAPCS64 leaves bits [63:32] of 32-bit register arguments unspecified;
    //// zero-extend them before they take part in 64-bit pointer arithmetic.
    uxtw            x5, w5                      //// u2_height
    uxtw            x6, w6                      //// u2_width
    uxtw            x7, w7                      //// u2_stridey

    ldr             w14, [sp, #112]             //// convert_uv_only
    mov             x9, x5                      //// x5 is consumed by the luma loop
    cmp             w14, #1
    beq             yuv420sp_uv_chroma          //// chroma only: skip luma copy

    ///* Luma pass: plain row-by-row copy */
    ldr             w8, [sp, #96]               //// u2_dest_stride_y (ldr w zero-extends)

    sub             x7, x7, x6                  //// source increment = stride - width
    sub             x8, x8, x6                  //// destination increment

yuv420sp_uv_row_loop_y:
    mov             x16, x6                     //// x16 = pixels left in this row

yuv420sp_uv_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_uv_col_loop_y      //// another full 16-pixel group left

    cbz             x16, yuv420sp_uv_row_loop__y //// width was a multiple of 16

    //// Tail: x16 pixels (1..15) remain. Step both pointers back by
    //// (16 - x16) so one more 16-byte copy ends exactly on the last pixel.
    //// Ex: width 162 -> loop copies 160, pointers go back to pixel 146 and
    //// pixels 146..161 are copied (again).
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - leftover
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_uv_row_loop__y:
    add             x0, x0, x7                  //// next source row
    add             x3, x3, x8                  //// next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_y

    ///* Chroma pass: interleave U and V */
yuv420sp_uv_chroma:
    ldrsw           x7, [sp, #80]               //// u2_strideu (sign-extended)
    ldrsw           x10, [sp, #88]              //// u2_stridev (sign-extended)
    ldrsw           x8, [sp, #104]              //// u2_dest_stride_uv (sign-extended)

    add             x6, x6, #1
    bic             x6, x6, #1                  //// width rounded up to even
    add             x9, x9, #1                  //// height + 1, halved below

    sub             x7, x7, x6, lsr #1          //// U source increment = strideu - width/2
    sub             x10, x10, x6, lsr #1        //// V source increment = stridev - width/2
    sub             x8, x8, x6                  //// destination increment

    lsr             x6, x6, #1                  //// chroma samples per row
    lsr             x5, x9, #1                  //// chroma rows = (height + 1) / 2

yuv420sp_uv_row_loop_uv:
    mov             x16, x6                     //// x16 = chroma samples left in row

yuv420sp_uv_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v0.8b}, [x1], #8           //// 8 U samples
    ld1             {v1.8b}, [x2], #8           //// 8 V samples
    st2             {v0.8b, v1.8b}, [x4], #16   //// store as U0 V0 U1 V1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_uv_col_loop_uv     //// another full 8-sample group left

    cbz             x16, yuv420sp_uv_row_loop__uv //// chroma width was a multiple of 8

    //// Tail: x16 samples (1..7) remain. Step the sources back by (8 - x16)
    //// samples and the interleaved destination by twice that, then convert
    //// one more group of 8 U + 8 V samples ending on the last sample.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - leftover
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v0.8b}, [x1], #8
    ld1             {v1.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_uv_row_loop__uv:
    add             x1, x1, x7                  //// next U row
    add             x2, x2, x10                 //// next V row
    add             x4, x4, x8                  //// next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_uv_row_loop_uv

    //// Epilogue: restore in reverse order of the prologue
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
|
|
|
|
|
|
|
|
|
|
|
|
///*****************************************************************************
//*
//*  Function Name    : impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8()
//*
//*  Description      : Converts an image from planar YUV420P to semi-planar
//*                     YUV420SP. The luma plane is copied row by row (unless
//*                     convert_uv_only == 1) and the U and V planes are
//*                     interleaved into one plane, V byte first (V,U,V,U,...).
//*                     Same flow as impeg2_fmt_conv_yuv420p_to_yuv420sp_uv_av8
//*                     except that V is loaded into v0 and U into v1, so the
//*                     interleaving store emits V before U.
//*
//*  Arguments        : x0        pu1_y              source luma plane
//*                     x1        pu1_u              source U plane
//*                     x2        pu1_v              source V plane
//*                     x3        pu1_dest_y         destination luma plane
//*                     x4        pu1_dest_uv        destination VU plane
//*                     w5        u2_height          luma rows
//*                     w6        u2_width           luma pixels per row
//*                     w7        u2_stridey         source luma stride
//*                     Stack arguments, offsets valid AFTER the 80-byte
//*                     prologue (push_v_regs + stp x19, x20):
//*                     sp, #80   u2_strideu         source U stride
//*                     sp, #88   u2_stridev         source V stride
//*                     sp, #96   u2_dest_stride_y   destination luma stride
//*                     sp, #104  u2_dest_stride_uv  destination VU stride
//*                     sp, #112  convert_uv_only    1 => skip the luma copy
//*
//*  Values Returned  : None
//*
//*  Register Usage   : x5  row counter
//*                     x6  luma width, later chroma width (width + 1) / 2
//*                     x7  luma source increment, later U source increment
//*                     x8  destination increment (luma, later VU)
//*                     x9  copy of u2_height for the chroma pass
//*                     x10 V source increment
//*                     w14 convert_uv_only
//*                     x16 column counter / tail rewind amount
//*                     x20 scratch (callee-saved, preserved)
//*                     v0, v1 pixel data
//*
//*  Stack Usage      : 80 Bytes. The stack argument offsets above rely on
//*                     push_v_regs (impeg2_neon_macros.s) pushing 64 bytes.
//*
//*  Interruptibility : Interruptible
//*
//*  Known Limitations
//*  Assumptions      : u2_width must be at least 16. Widths that are not a
//*                     multiple of 16 are handled by stepping the pointers
//*                     back and re-copying the final 16 luma pixels
//*                     (8 V + 8 U samples), which would touch memory before
//*                     the row start for narrower images.
//*                     Both row loops test their counter at the bottom, so
//*                     at least one row is always processed.
//*                     Chroma rows processed = (u2_height + 1) / 2.
//*                     U rows advance by u2_strideu and V rows by u2_stridev.
//*
//*  Revision History :
//*         DD MM YYYY   Author(s)       Changes (Describe the changes made)
//*         07 06 2010   Varshita        Draft
//*         07 06 2010   Naveen Kr T     Completed
//*
//*****************************************************************************/

    .global impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8
impeg2_fmt_conv_yuv420p_to_yuv420sp_vu_av8:

    //// Prologue: push_v_regs + x19/x20 = 80 bytes, so the first stack
    //// argument is found at [sp, #80] from here on.
    push_v_regs
    stp             x19, x20, [sp, #-16]!

    //// AAPCS64 leaves bits [63:32] of 32-bit register arguments unspecified;
    //// zero-extend them before they take part in 64-bit pointer arithmetic.
    uxtw            x5, w5                      //// u2_height
    uxtw            x6, w6                      //// u2_width
    uxtw            x7, w7                      //// u2_stridey

    ldr             w14, [sp, #112]             //// convert_uv_only
    mov             x9, x5                      //// x5 is consumed by the luma loop
    cmp             w14, #1
    beq             yuv420sp_vu_chroma          //// chroma only: skip luma copy

    ///* Luma pass: plain row-by-row copy */
    ldr             w8, [sp, #96]               //// u2_dest_stride_y (ldr w zero-extends)

    sub             x7, x7, x6                  //// source increment = stride - width
    sub             x8, x8, x6                  //// destination increment

yuv420sp_vu_row_loop_y:
    mov             x16, x6                     //// x16 = pixels left in this row

yuv420sp_vu_col_loop_y:
    prfm            pldl1keep, [x0, #128]
    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16
    sub             x16, x16, #16
    cmp             x16, #15
    bgt             yuv420sp_vu_col_loop_y      //// another full 16-pixel group left

    cbz             x16, yuv420sp_vu_row_loop__y //// width was a multiple of 16

    //// Tail: x16 pixels (1..15) remain. Step both pointers back by
    //// (16 - x16) so one more 16-byte copy ends exactly on the last pixel.
    //// Ex: width 162 -> loop copies 160, pointers go back to pixel 146 and
    //// pixels 146..161 are copied (again).
    sub             x20, x16, #16
    neg             x16, x20                    //// x16 = 16 - leftover
    sub             x0, x0, x16
    sub             x3, x3, x16

    ld1             {v0.8b, v1.8b}, [x0], #16
    st1             {v0.8b, v1.8b}, [x3], #16

yuv420sp_vu_row_loop__y:
    add             x0, x0, x7                  //// next source row
    add             x3, x3, x8                  //// next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_y

    ///* Chroma pass: interleave V and U */
yuv420sp_vu_chroma:
    ldrsw           x7, [sp, #80]               //// u2_strideu (sign-extended)
    ldrsw           x10, [sp, #88]              //// u2_stridev (sign-extended)
    ldrsw           x8, [sp, #104]              //// u2_dest_stride_uv (sign-extended)

    add             x6, x6, #1
    bic             x6, x6, #1                  //// width rounded up to even
    add             x9, x9, #1                  //// height + 1, halved below

    sub             x7, x7, x6, lsr #1          //// U source increment = strideu - width/2
    sub             x10, x10, x6, lsr #1        //// V source increment = stridev - width/2
    sub             x8, x8, x6                  //// destination increment

    lsr             x6, x6, #1                  //// chroma samples per row
    lsr             x5, x9, #1                  //// chroma rows = (height + 1) / 2

yuv420sp_vu_row_loop_uv:
    mov             x16, x6                     //// x16 = chroma samples left in row

yuv420sp_vu_col_loop_uv:
    prfm            pldl1keep, [x1, #128]
    prfm            pldl1keep, [x2, #128]

    ld1             {v1.8b}, [x1], #8           //// 8 U samples -> second lane set
    ld1             {v0.8b}, [x2], #8           //// 8 V samples -> first lane set
    st2             {v0.8b, v1.8b}, [x4], #16   //// store as V0 U0 V1 U1 ...

    sub             x16, x16, #8
    cmp             x16, #7
    bgt             yuv420sp_vu_col_loop_uv     //// another full 8-sample group left

    cbz             x16, yuv420sp_vu_row_loop__uv //// chroma width was a multiple of 8

    //// Tail: x16 samples (1..7) remain. Step the sources back by (8 - x16)
    //// samples and the interleaved destination by twice that, then convert
    //// one more group of 8 V + 8 U samples ending on the last sample.
    sub             x20, x16, #8
    neg             x16, x20                    //// x16 = 8 - leftover
    sub             x1, x1, x16
    sub             x2, x2, x16
    sub             x4, x4, x16, lsl #1

    ld1             {v1.8b}, [x1], #8
    ld1             {v0.8b}, [x2], #8
    st2             {v0.8b, v1.8b}, [x4], #16

yuv420sp_vu_row_loop__uv:
    add             x1, x1, x7                  //// next U row
    add             x2, x2, x10                 //// next V row
    add             x4, x4, x8                  //// next destination row
    subs            x5, x5, #1
    bgt             yuv420sp_vu_row_loop_uv

    //// Epilogue: restore in reverse order of the prologue
    ldp             x19, x20, [sp], #16
    pop_v_regs
    ret
|
|
|