226 lines
6.0 KiB
ArmAsm
226 lines
6.0 KiB
ArmAsm
//******************************************************************************
|
|
//*
|
|
//* Copyright (C) 2015 The Android Open Source Project
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************
|
|
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
//*/
|
|
|
|
//******************************************************************************
|
|
//*
|
|
//* @brief
|
|
//* This file contains definitions of routines for combing artifact check (CAC)
|
|
//*
|
|
//* @author
|
|
//* Ittiam
|
|
//*
|
|
//* @par List of Functions:
|
|
//* - ideint_cac_8x8_av8()
|
|
//*
|
|
//* @remarks
|
|
//* None
|
|
//*
|
|
//*******************************************************************************
|
|
|
|
|
|
//******************************************************************************
//*
//* @brief Combing artifact check (CAC) over four rows of each of two fields
//*
//* @par Description
//*  Loads four 8-byte rows from the top field and four from the bottom field
//*  and evaluates the left and right groups of four columns as two
//*  sub-blocks. For every sub-block two measures are accumulated:
//*    adj - differences between the top field and the bottom field
//*          (row sums, thresholded at 20, plus 4x column averages,
//*          thresholded at 5)
//*    alt - differences between rows 0/2 and rows 1/3 taken inside the
//*          fields (row sums plus 4x column averages)
//*  alt is then biased to alt + (alt >> 3) + 4 and compared with adj.
//*
//* @param[in] pu1_top  (x0)
//*  UWORD8 pointer to top field
//*
//* @param[in] pu1_bot  (x1)
//*  UWORD8 pointer to bottom field
//*
//* @param[in] top_strd (x2)
//*  Top field stride, added to x0 after every row load
//*
//* @param[in] bot_strd (x3)
//*  Bottom field stride, added to x1 after every row load
//*
//* @returns
//*  x0 = 1 when adj > biased alt for at least one sub-block, 0 otherwise
//*
//* @remarks
//*  Leaf routine, no stack usage.
//*  Modifies x0, x1, the condition flags, v0-v4, v6, v16-v22 and v24-v31.
//*  NOTE(review): both strides are consumed as full 64-bit registers. If the
//*  C prototype declares them as 32-bit integers, the upper halves of x2/x3
//*  must already be sign-extended by the caller - confirm.
//*
//******************************************************************************

    .global ideint_cac_8x8_av8

ideint_cac_8x8_av8:

    // ---- Fetch rows 0..3 of both fields (top/bottom interleaved) -----------
    ld1     {v28.8b}, [x0], x2              // top    row 0
    ld1     {v29.8b}, [x1], x3              // bottom row 0
    ld1     {v30.8b}, [x0], x2              // top    row 1
    ld1     {v31.8b}, [x1], x3              // bottom row 1
    ld1     {v24.8b}, [x0], x2              // top    row 2
    ld1     {v25.8b}, [x1], x3              // bottom row 2
    ld1     {v26.8b}, [x0], x2              // top    row 3
    ld1     {v27.8b}, [x1], x3              // bottom row 3

    // Pair every top row with its bottom row: vN = [ top row | bottom row ]
    ins     v28.d[1], v29.d[0]
    ins     v30.d[1], v31.d[0]
    ins     v24.d[1], v25.d[0]
    ins     v26.d[1], v27.d[0]

    // ---- Row sums -----------------------------------------------------------
    // Two pairwise widening adds turn 16 bytes into four 32-bit sums:
    // [ top left4, top right4, bottom left4, bottom right4 ]
    uaddlp  v0.8h, v28.16b                  // row 0
    uaddlp  v2.8h, v30.16b                  // row 1
    uaddlp  v4.8h, v24.16b                  // row 2
    uaddlp  v6.8h, v26.16b                  // row 3
    uaddlp  v0.4s, v0.8h
    uaddlp  v2.4s, v2.8h
    uaddlp  v4.4s, v4.8h
    uaddlp  v6.4s, v6.8h

    // Each sum is at most 4 * 255, so two rows share one 32-bit lane:
    // even row in the low halfword, odd row in the high halfword.
    shl     v16.4s, v2.4s, #16
    shl     v18.4s, v6.4s, #16
    orr     v16.16b, v16.16b, v0.16b        // rows 0/1: 8 packed sums
    orr     v18.16b, v18.16b, v4.16b        // rows 2/3: 8 packed sums

    // ---- Row based adj: |top sum - bottom sum| ------------------------------
    ext     v17.16b, v16.16b, v16.16b, #8   // low half <- bottom sums, rows 0/1
    ext     v19.16b, v18.16b, v18.16b, #8   // low half <- bottom sums, rows 2/3
    uabd    v16.4h, v16.4h, v17.4h
    uabd    v17.4h, v18.4h, v19.4h
    ins     v16.d[1], v17.d[0]              // v16 = 8 absolute differences

    // Keep only differences >= RSUM_CSUM_THRESH (20)
    movi    v18.8h, #20
    cmhs    v20.8h, v16.8h, v18.8h
    and     v20.16b, v20.16b, v16.16b

    // Fold rows 2/3 onto rows 0/1 -> four partial adj values
    ext     v21.16b, v20.16b, v20.16b, #8
    add     v20.4h, v20.4h, v21.4h

    // ---- Row based alt: |row0 - row1| + |row2 - row3| inside each field -----
    uabd    v0.4s, v0.4s, v2.4s
    uabd    v4.4s, v4.4s, v6.4s
    add     v0.4s, v0.4s, v4.4s
    ext     v1.16b, v0.16b, v0.16b, #8      // low half <- bottom field terms
    add     v21.2s, v0.2s, v1.2s            // top + bottom, one lane per sub-block

    // ---- Column based adj: |avg top column - avg bottom column| -------------
    urhadd  v0.16b, v28.16b, v30.16b        // rows 0,1
    urhadd  v2.16b, v24.16b, v26.16b        // rows 2,3
    urhadd  v0.16b, v0.16b, v2.16b          // [ top averages | bottom averages ]
    ext     v1.16b, v0.16b, v0.16b, #8
    uabd    v0.8b, v0.8b, v1.8b

    // Keep only differences >= RSUM_CSUM_THRESH >> 2 (5)
    movi    v22.8b, #5
    cmhs    v1.8b, v0.8b, v22.8b
    and     v0.8b, v0.8b, v1.8b

    // Averages were taken over 4 rows, so scale the pair sums back up by 4
    uaddlp  v0.4h, v0.8b
    shl     v0.4h, v0.4h, #2

    // Merge with the row based part and reduce to one adj per sub-block
    add     v20.4h, v20.4h, v0.4h
    uaddlp  v20.2s, v20.4h

    // ---- Column based alt: |avg of rows 0,2 - avg of rows 1,3| --------------
    urhadd  v0.8b, v28.8b, v29.8b           // top 0 with bottom 0
    urhadd  v2.8b, v24.8b, v25.8b           // top 2 with bottom 2
    urhadd  v1.8b, v30.8b, v31.8b           // top 1 with bottom 1
    urhadd  v3.8b, v26.8b, v27.8b           // top 3 with bottom 3
    urhadd  v0.8b, v0.8b, v2.8b
    urhadd  v1.8b, v1.8b, v3.8b
    uabd    v0.8b, v0.8b, v1.8b
    uaddlp  v0.4h, v0.8b
    shl     v0.4h, v0.4h, #2
    uaddlp  v0.2s, v0.4h
    add     v21.2s, v21.2s, v0.2s           // one alt per sub-block

    // ---- Bias alt -----------------------------------------------------------
    // SAD_BIAS_MULT_SHIFT: alt += alt >> 3
    ushr    v0.2s, v21.2s, #3
    add     v21.2s, v21.2s, v0.2s
    // SAD_BIAS_ADDITIVE >> 1: alt += 4
    movi    v0.2s, #4
    add     v21.2s, v21.2s, v0.2s

    // ---- Decision: any sub-block with adj > alt ? ---------------------------
    cmhi    v0.2s, v20.2s, v21.2s           // all-ones lane where adj > alt
    uaddlp  v0.1d, v0.2s                    // low word is non-zero if any lane hit
    smov    x0, v0.s[0]
    cmp     x0, #0
    cset    x0, ne                          // normalise to 0 / 1
    ret
|