/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H
#define ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H

namespace android {

// depends on AudioResamplerFirOps.h, AudioResamplerFirProcess.h

#if USE_NEON

// use intrinsics if inline arm32 assembly is not possible
#if !USE_INLINE_ASSEMBLY
#define USE_INTRINSIC
#endif

// following intrinsics available only on ARM 64 bit ACLE
#ifndef __aarch64__
#undef vld1q_f32_x2
#undef vld1q_s32_x2
#endif

#define TO_STRING2(x) #x
#define TO_STRING(x) TO_STRING2(x)
// uncomment to print GCC version, may be relevant for intrinsic optimizations
/* #pragma message ("GCC version: " TO_STRING(__GNUC__) \
        "." TO_STRING(__GNUC_MINOR__) \
        "." TO_STRING(__GNUC_PATCHLEVEL__)) */

//
// NEON specializations are enabled for Process() and ProcessL() in AudioResamplerFirProcess.h
//
// Two variants are presented here:
// ARM NEON inline assembly which appears up to 10-15% faster than intrinsics (gcc 4.9) for arm32.
// ARM NEON intrinsics which can also be used by arm64 and x86/64 with NEON header.
//

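// For orientation, a minimal usage sketch (illustrative only; the actual calling
// convention lives in AudioResamplerFirProcess.h, so treat the setup below as an
// assumption modeled on the specializations in this header): a fixed-phase,
// stereo, 16-tap-stride pass accumulating one output frame might look like
//
//     int32_t out[2] = {0, 0};            // L/R accumulators, Q-format
//     ProcessL<2, 16>(out, count,         // count: positive multiple of 8
//             coefsP, coefsN,             // 16-byte aligned coefficient halves
//             sP, sN,                     // positive/negative sample pointers
//             volumeLR);                  // 8-byte aligned L/R volume pair
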
// Macros to save a mono/stereo accumulator sample in q0 (and q4) as stereo out.
// These are only used for inline assembly.
#define ASSEMBLY_ACCUMULATE_MONO \
        "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
        "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
        "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums */\
        "vpadd.s32 d0, d0, d0 \n"/* (1+4d) and replicate L/R */\
        "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
        "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */

#define ASSEMBLY_ACCUMULATE_STEREO \
        "vld1.s32 {d2}, [%[vLR]:64] \n"/* (1) load volumes */\
        "vld1.s32 {d3}, %[out] \n"/* (2) unaligned load the output */\
        "vpadd.s32 d0, d0, d1 \n"/* (1) add all 4 partial sums from q0 */\
        "vpadd.s32 d8, d8, d9 \n"/* (1) add all 4 partial sums from q4 */\
        "vpadd.s32 d0, d0, d8 \n"/* (1+4d) combine into L/R */\
        "vqrdmulh.s32 d0, d0, d2 \n"/* (2+3d) apply volume */\
        "vqadd.s32 d3, d3, d0 \n"/* (1+4d) accumulate result (saturating) */\
        "vst1.s32 {d3}, %[out] \n"/* (2+2d) store result */

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int16_t* coefsP1,
        const int16_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int16_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int16_t*)__builtin_assume_aligned(coefsN, 16);

    int16x4_t interp;
    if (!FIXED) {
        interp = vdup_n_s16(lerpP);
        //interp = (int16x4_t)vset_lane_s32 ((int32x2_t)lerpP, interp, 0);
        coefsP1 = (const int16_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int16_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
        int16x8_t posCoef = vld1q_s16(coefsP);
        coefsP += 8;
        int16x8_t negCoef = vld1q_s16(coefsN);
        coefsN += 8;
        if (!FIXED) { // interpolate
            int16x8_t posCoef1 = vld1q_s16(coefsP1);
            coefsP1 += 8;
            int16x8_t negCoef1 = vld1q_s16(coefsN1);
            coefsN1 += 8;

            posCoef1 = vsubq_s16(posCoef1, posCoef);
            negCoef = vsubq_s16(negCoef, negCoef1);

            posCoef1 = vqrdmulhq_lane_s16(posCoef1, interp, 0);
            negCoef = vqrdmulhq_lane_s16(negCoef, interp, 0);

            posCoef = vaddq_s16(posCoef, posCoef1);
            negCoef = vaddq_s16(negCoef, negCoef1);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp), vget_high_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_high_s16(posSamp), vget_low_s16(posCoef)); // reversed
            accum = vmlal_s16(accum, vget_low_s16(negSamp), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp), vget_high_s16(negCoef));
            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // dot product
            accum = vmlal_s16(accum, vget_low_s16(posSamp.val[0]), vget_high_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_high_s16(posSamp.val[0]), vget_low_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_low_s16(posSamp.val[1]), vget_high_s16(posCoef)); // r
            accum2 = vmlal_s16(accum2, vget_high_s16(posSamp.val[1]), vget_low_s16(posCoef)); // r
            accum = vmlal_s16(accum, vget_low_s16(negSamp.val[0]), vget_low_s16(negCoef));
            accum = vmlal_s16(accum, vget_high_s16(negSamp.val[0]), vget_high_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_low_s16(negSamp.val[1]), vget_low_s16(negCoef));
            accum2 = vmlal_s16(accum2, vget_high_s16(negSamp.val[1]), vget_high_s16(negCoef));
            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(int32_t* out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* volumeLR,
        uint32_t lerpP,
        const int32_t* coefsP1,
        const int32_t* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const int32_t*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const int32_t*)__builtin_assume_aligned(coefsN, 16);

    int32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_s32(lerpP);
        coefsP1 = (const int32_t*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const int32_t*)__builtin_assume_aligned(coefsN1, 16);
    }
    int32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_s32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_s32(0);
    }
    do {
#ifdef vld1q_s32_x2
        int32x4x2_t posCoef = vld1q_s32_x2(coefsP);
        coefsP += 8;
        int32x4x2_t negCoef = vld1q_s32_x2(coefsN);
        coefsN += 8;
#else
        int32x4x2_t posCoef;
        posCoef.val[0] = vld1q_s32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_s32(coefsP);
        coefsP += 4;
        int32x4x2_t negCoef;
        negCoef.val[0] = vld1q_s32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_s32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_s32_x2
            int32x4x2_t posCoef1 = vld1q_s32_x2(coefsP1);
            coefsP1 += 8;
            int32x4x2_t negCoef1 = vld1q_s32_x2(coefsN1);
            coefsN1 += 8;
#else
            int32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_s32(coefsP1);
            coefsP1 += 4;
            int32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_s32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_s32(coefsN1);
            coefsN1 += 4;
#endif

            posCoef1.val[0] = vsubq_s32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_s32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_s32(negCoef.val[1], negCoef1.val[1]);

            posCoef1.val[0] = vqrdmulhq_lane_s32(posCoef1.val[0], interp, 0);
            posCoef1.val[1] = vqrdmulhq_lane_s32(posCoef1.val[1], interp, 0);
            negCoef.val[0] = vqrdmulhq_lane_s32(negCoef.val[0], interp, 0);
            negCoef.val[1] = vqrdmulhq_lane_s32(negCoef.val[1], interp, 0);

            posCoef.val[0] = vaddq_s32(posCoef.val[0], posCoef1.val[0]);
            posCoef.val[1] = vaddq_s32(posCoef.val[1], posCoef1.val[1]);
            negCoef.val[0] = vaddq_s32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vaddq_s32(negCoef.val[1], negCoef1.val[1]);
        }
        switch (CHANNELS) {
        case 1: {
            int16x8_t posSamp = vld1q_s16(sP);
            int16x8_t negSamp = vld1q_s16(sN);
            sN += 8;
            posSamp = vrev64q_s16(posSamp);

            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            sP -= 8;
        } break;
        case 2: {
            int16x8x2_t posSamp = vld2q_s16(sP);
            int16x8x2_t negSamp = vld2q_s16(sN);
            sN += 16;
            posSamp.val[0] = vrev64q_s16(posSamp.val[0]);
            posSamp.val[1] = vrev64q_s16(posSamp.val[1]);

            // left
            int32x4_t posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[0]), 15);
            int32x4_t posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[0]), 15);
            int32x4_t negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[0]), 15);
            int32x4_t negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[0]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum = vaddq_s32(accum, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum = vaddq_s32(accum, posSamp1);
            accum = vaddq_s32(accum, negSamp0);

            // right
            posSamp0 = vshll_n_s16(vget_low_s16(posSamp.val[1]), 15);
            posSamp1 = vshll_n_s16(vget_high_s16(posSamp.val[1]), 15);
            negSamp0 = vshll_n_s16(vget_low_s16(negSamp.val[1]), 15);
            negSamp1 = vshll_n_s16(vget_high_s16(negSamp.val[1]), 15);

            // dot product
            posSamp0 = vqrdmulhq_s32(posSamp0, posCoef.val[1]); // reversed
            posSamp1 = vqrdmulhq_s32(posSamp1, posCoef.val[0]); // reversed
            negSamp0 = vqrdmulhq_s32(negSamp0, negCoef.val[0]);
            negSamp1 = vqrdmulhq_s32(negSamp1, negCoef.val[1]);

            accum2 = vaddq_s32(accum2, posSamp0);
            negSamp0 = vaddq_s32(negSamp0, negSamp1);
            accum2 = vaddq_s32(accum2, posSamp1);
            accum2 = vaddq_s32(accum2, negSamp0);

            sP -= 16;
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const int32_t*)__builtin_assume_aligned(volumeLR, 8);
    int32x2_t vLR = vld1_s32(volumeLR);
    int32x2_t outSamp = vld1_s32(out);
    // combine and funnel down accumulator
    int32x2_t outAccum = vpadd_s32(vget_low_s32(accum), vget_high_s32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_s32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        int32x2_t outAccum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        outAccum = vpadd_s32(outAccum, outAccum2);
    }
    outAccum = vqrdmulh_s32(outAccum, vLR);
    outSamp = vqadd_s32(outSamp, outAccum);
    vst1_s32(out, outSamp);
}

template <int CHANNELS, int STRIDE, bool FIXED>
static inline void ProcessNeonIntrinsic(float* out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* volumeLR,
        float lerpP,
        const float* coefsP1,
        const float* coefsN1)
{
    ALOG_ASSERT(count > 0 && (count & 7) == 0); // multiple of 8
    static_assert(CHANNELS == 1 || CHANNELS == 2, "CHANNELS must be 1 or 2");

    sP -= CHANNELS*((STRIDE>>1)-1);
    coefsP = (const float*)__builtin_assume_aligned(coefsP, 16);
    coefsN = (const float*)__builtin_assume_aligned(coefsN, 16);

    float32x2_t interp;
    if (!FIXED) {
        interp = vdup_n_f32(lerpP);
        coefsP1 = (const float*)__builtin_assume_aligned(coefsP1, 16);
        coefsN1 = (const float*)__builtin_assume_aligned(coefsN1, 16);
    }
    float32x4_t accum, accum2;
    // warning uninitialized if we use veorq_s32
    // (alternative to below) accum = veorq_s32(accum, accum);
    accum = vdupq_n_f32(0);
    if (CHANNELS == 2) {
        // (alternative to below) accum2 = veorq_s32(accum2, accum2);
        accum2 = vdupq_n_f32(0);
    }
    do {
#ifdef vld1q_f32_x2
        float32x4x2_t posCoef = vld1q_f32_x2(coefsP);
        coefsP += 8;
        float32x4x2_t negCoef = vld1q_f32_x2(coefsN);
        coefsN += 8;
#else
        float32x4x2_t posCoef;
        posCoef.val[0] = vld1q_f32(coefsP);
        coefsP += 4;
        posCoef.val[1] = vld1q_f32(coefsP);
        coefsP += 4;
        float32x4x2_t negCoef;
        negCoef.val[0] = vld1q_f32(coefsN);
        coefsN += 4;
        negCoef.val[1] = vld1q_f32(coefsN);
        coefsN += 4;
#endif
        if (!FIXED) { // interpolate
#ifdef vld1q_f32_x2
            float32x4x2_t posCoef1 = vld1q_f32_x2(coefsP1);
            coefsP1 += 8;
            float32x4x2_t negCoef1 = vld1q_f32_x2(coefsN1);
            coefsN1 += 8;
#else
            float32x4x2_t posCoef1;
            posCoef1.val[0] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            posCoef1.val[1] = vld1q_f32(coefsP1);
            coefsP1 += 4;
            float32x4x2_t negCoef1;
            negCoef1.val[0] = vld1q_f32(coefsN1);
            coefsN1 += 4;
            negCoef1.val[1] = vld1q_f32(coefsN1);
            coefsN1 += 4;
#endif
            posCoef1.val[0] = vsubq_f32(posCoef1.val[0], posCoef.val[0]);
            posCoef1.val[1] = vsubq_f32(posCoef1.val[1], posCoef.val[1]);
            negCoef.val[0] = vsubq_f32(negCoef.val[0], negCoef1.val[0]);
            negCoef.val[1] = vsubq_f32(negCoef.val[1], negCoef1.val[1]);

            posCoef.val[0] = vmlaq_lane_f32(posCoef.val[0], posCoef1.val[0], interp, 0);
            posCoef.val[1] = vmlaq_lane_f32(posCoef.val[1], posCoef1.val[1], interp, 0);
            negCoef.val[0] = vmlaq_lane_f32(negCoef1.val[0], negCoef.val[0], interp, 0); // rev
            negCoef.val[1] = vmlaq_lane_f32(negCoef1.val[1], negCoef.val[1], interp, 0); // rev
        }
        switch (CHANNELS) {
        case 1: {
#ifdef vld1q_f32_x2
            float32x4x2_t posSamp = vld1q_f32_x2(sP);
            float32x4x2_t negSamp = vld1q_f32_x2(sN);
            sN += 8;
            sP -= 8;
#else
            float32x4x2_t posSamp;
            posSamp.val[0] = vld1q_f32(sP);
            sP += 4;
            posSamp.val[1] = vld1q_f32(sP);
            sP -= 12;
            float32x4x2_t negSamp;
            negSamp.val[0] = vld1q_f32(sN);
            sN += 4;
            negSamp.val[1] = vld1q_f32(sN);
            sN += 4;
#endif
            // effectively we want a vrev128q_f32()
            posSamp.val[0] = vrev64q_f32(posSamp.val[0]);
            posSamp.val[1] = vrev64q_f32(posSamp.val[1]);
            posSamp.val[0] = vcombine_f32(
                    vget_high_f32(posSamp.val[0]), vget_low_f32(posSamp.val[0]));
            posSamp.val[1] = vcombine_f32(
                    vget_high_f32(posSamp.val[1]), vget_low_f32(posSamp.val[1]));

            accum = vmlaq_f32(accum, posSamp.val[0], posCoef.val[1]);
            accum = vmlaq_f32(accum, posSamp.val[1], posCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp.val[1], negCoef.val[1]);
        } break;
        case 2: {
            float32x4x2_t posSamp0 = vld2q_f32(sP);
            sP += 8;
            float32x4x2_t negSamp0 = vld2q_f32(sN);
            sN += 8;
            posSamp0.val[0] = vrev64q_f32(posSamp0.val[0]);
            posSamp0.val[1] = vrev64q_f32(posSamp0.val[1]);
            posSamp0.val[0] = vcombine_f32(
                    vget_high_f32(posSamp0.val[0]), vget_low_f32(posSamp0.val[0]));
            posSamp0.val[1] = vcombine_f32(
                    vget_high_f32(posSamp0.val[1]), vget_low_f32(posSamp0.val[1]));

            float32x4x2_t posSamp1 = vld2q_f32(sP);
            sP -= 24;
            float32x4x2_t negSamp1 = vld2q_f32(sN);
            sN += 8;
            posSamp1.val[0] = vrev64q_f32(posSamp1.val[0]);
            posSamp1.val[1] = vrev64q_f32(posSamp1.val[1]);
            posSamp1.val[0] = vcombine_f32(
                    vget_high_f32(posSamp1.val[0]), vget_low_f32(posSamp1.val[0]));
            posSamp1.val[1] = vcombine_f32(
                    vget_high_f32(posSamp1.val[1]), vget_low_f32(posSamp1.val[1]));

            // Note: speed is affected by accumulation order.
            // Also, speed appears slower using vmul/vadd instead of vmla for
            // stereo case, comparable for mono.

            accum = vmlaq_f32(accum, negSamp0.val[0], negCoef.val[0]);
            accum = vmlaq_f32(accum, negSamp1.val[0], negCoef.val[1]);
            accum2 = vmlaq_f32(accum2, negSamp0.val[1], negCoef.val[0]);
            accum2 = vmlaq_f32(accum2, negSamp1.val[1], negCoef.val[1]);

            accum = vmlaq_f32(accum, posSamp0.val[0], posCoef.val[1]); // reversed
            accum = vmlaq_f32(accum, posSamp1.val[0], posCoef.val[0]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp0.val[1], posCoef.val[1]); // reversed
            accum2 = vmlaq_f32(accum2, posSamp1.val[1], posCoef.val[0]); // reversed
        } break;
        }
    } while (count -= 8);

    // multiply by volume and save
    volumeLR = (const float*)__builtin_assume_aligned(volumeLR, 8);
    float32x2_t vLR = vld1_f32(volumeLR);
    float32x2_t outSamp = vld1_f32(out);
    // combine and funnel down accumulator
    float32x2_t outAccum = vpadd_f32(vget_low_f32(accum), vget_high_f32(accum));
    if (CHANNELS == 1) {
        // duplicate accum to both L and R
        outAccum = vpadd_f32(outAccum, outAccum);
    } else if (CHANNELS == 2) {
        // accum2 contains R, fold in
        float32x2_t outAccum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        outAccum = vpadd_f32(outAccum, outAccum2);
    }
    outSamp = vmla_f32(outSamp, outAccum, vLR);
    vst1_f32(out, outSamp);
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply (reversed) samples by coef
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples by coef
        "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
        "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q10"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// (1) acc_L = 0
        "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q10}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs

        "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// (0 combines+) reverse positive right

        "vmlal.s16 q0, d4, d17 \n"// (1) multiply (reversed) samples left
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply (reversed) samples left
        "vmlal.s16 q4, d6, d17 \n"// (1) multiply (reversed) samples right
        "vmlal.s16 q4, d7, d16 \n"// (1) multiply (reversed) samples right
        "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
        "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
        "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
        "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q10"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else

    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase S32 Q15
        "veor q0, q0, q0 \n"// (0 - combines+) accumulator = 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// (2+0d) load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// (2) load 8 16-bits mono samples
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16 q2, q2 \n"// (1) reverse s3, s2, s1, s0, s7, s6, s5, s4

        "vadd.s16 q8, q8, q9 \n"// (1+2d) interpolate (step3) 1st set
        "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set

        // reordering the vmlal to do d6, d7 before d4, d5 is slower(?)
        "vmlal.s16 q0, d4, d17 \n"// (1+0d) multiply reversed samples by coef
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples by coef
        "vmlal.s16 q0, d6, d20 \n"// (1) multiply neg samples
        "vmlal.s16 q0, d7, d21 \n"// (1) multiply neg samples

        // moving these ARM instructions before neon above seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #16 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP] "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int16_t* coefsP,
        const int16_t* coefsN,
        const int16_t* coefsP1,
        const int16_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
        "veor q0, q0, q0 \n"// (1) acc_L = 0
        "veor q4, q4, q4 \n"// (0 combines+) acc_R = 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// (3+0d) load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// (3) load 8 16-bits stereo frames
        "vld1.16 {q8}, [%[coefsP0]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q9}, [%[coefsP1]:128]! \n"// (1) load 8 16-bits coefs for interpolation
        "vld1.16 {q10}, [%[coefsN1]:128]! \n"// (1) load 8 16-bits coefs
        "vld1.16 {q11}, [%[coefsN0]:128]! \n"// (1) load 8 16-bits coefs for interpolation

        "vsub.s16 q9, q9, q8 \n"// (1) interpolate (step1) 1st set of coefs
        "vsub.s16 q11, q11, q10 \n"// (1) interpolate (step1) 2nd set of coefs

        "vqrdmulh.s16 q9, q9, d2[0] \n"// (2) interpolate (step2) 1st set of coefs
        "vqrdmulh.s16 q11, q11, d2[0] \n"// (2) interpolate (step2) 2nd set of coefs

        "vrev64.16 q2, q2 \n"// (1) reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// (1) reverse 8 samples of positive right

        "vadd.s16 q8, q8, q9 \n"// (1+1d) interpolate (step3) 1st set
        "vadd.s16 q10, q10, q11 \n"// (1+1d) interpolate (step3) 2nd set

        "vmlal.s16 q0, d4, d17 \n"// (1) multiply reversed samples left
        "vmlal.s16 q0, d5, d16 \n"// (1) multiply reversed samples left
        "vmlal.s16 q4, d6, d17 \n"// (1) multiply reversed samples right
        "vmlal.s16 q4, d7, d16 \n"// (1) multiply reversed samples right
        "vmlal.s16 q0, d10, d20 \n"// (1) multiply samples left
        "vmlal.s16 q0, d11, d21 \n"// (1) multiply samples left
        "vmlal.s16 q4, d12, d20 \n"// (1) multiply samples right
        "vmlal.s16 q4, d13, d21 \n"// (1) multiply samples right

        // moving these ARM before neon seems to be slower
        "subs %[count], %[count], #8 \n"// (1) update loop counter
        "sub %[sP], %[sP], #32 \n"// (0) move pointer to next set of samples

        // sP used after branch (warning)
        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP] "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11"
    );
#endif
}

template <>
inline void ProcessL<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// result, initialize to 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side

        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
        "subs %[count], %[count], #8 \n"// update loop counter

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void ProcessL<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int16_t* sP,
        const int16_t* sN,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "veor q0, q0, q0 \n"// result, initialize to 0
        "veor q4, q4, q4 \n"// result, initialize to 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right

        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by coef

        "vadd.s32 q4, q4, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q4, q4, q15 \n"// accumulate result
        "vadd.s32 q4, q4, q13 \n"// accumulate result

        "subs %[count], %[count], #8 \n"// update loop counter
        "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<1, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 1; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
        "veor q0, q0, q0 \n"// result, initialize to 0

        "1: \n"

        "vld1.16 {q2}, [%[sP]] \n"// load 8 16-bits mono samples
        "vld1.16 {q3}, [%[sN]]! \n"// load 8 16-bits mono samples
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
        "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
        "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
        "vsub.s32 q15, q15, q11 \n"// interpolate (step1)

        "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)

        "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
        "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
        "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
        "vadd.s32 q11, q11, q15 \n"// interpolate (step3)

        "vrev64.16 q2, q2 \n"// reverse 8 samples of the positive side

        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d7, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        "sub %[sP], %[sP], #16 \n"// move pointer to next set of samples
        "subs %[count], %[count], #8 \n"// update loop counter

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_MONO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP] "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template <>
inline void Process<2, 16>(int32_t* const out,
        int count,
        const int32_t* coefsP,
        const int32_t* coefsN,
        const int32_t* coefsP1,
        const int32_t* coefsN1,
        const int16_t* sP,
        const int16_t* sN,
        uint32_t lerpP,
        const int32_t* const volumeLR)
{
#ifdef USE_INTRINSIC
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
#else
    const int CHANNELS = 2; // template specialization does not preserve params
    const int STRIDE = 16;
    sP -= CHANNELS*((STRIDE>>1)-1);
    asm (
        "vmov.32 d2[0], %[lerpP] \n"// load the positive phase
        "veor q0, q0, q0 \n"// result, initialize to 0
        "veor q4, q4, q4 \n"// result, initialize to 0

        "1: \n"

        "vld2.16 {q2, q3}, [%[sP]] \n"// load 8 16-bits stereo frames
        "vld2.16 {q5, q6}, [%[sN]]! \n"// load 8 16-bits stereo frames
        "vld1.32 {q8, q9}, [%[coefsP0]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q12, q13}, [%[coefsP1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q10, q11}, [%[coefsN1]:128]! \n"// load 8 32-bits coefs
        "vld1.32 {q14, q15}, [%[coefsN0]:128]! \n"// load 8 32-bits coefs

        "vsub.s32 q12, q12, q8 \n"// interpolate (step1)
        "vsub.s32 q13, q13, q9 \n"// interpolate (step1)
        "vsub.s32 q14, q14, q10 \n"// interpolate (step1)
        "vsub.s32 q15, q15, q11 \n"// interpolate (step1)

        "vqrdmulh.s32 q12, q12, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q13, q13, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q14, q14, d2[0] \n"// interpolate (step2)
        "vqrdmulh.s32 q15, q15, d2[0] \n"// interpolate (step2)

        "vadd.s32 q8, q8, q12 \n"// interpolate (step3)
        "vadd.s32 q9, q9, q13 \n"// interpolate (step3)
        "vadd.s32 q10, q10, q14 \n"// interpolate (step3)
        "vadd.s32 q11, q11, q15 \n"// interpolate (step3)

        "vrev64.16 q2, q2 \n"// reverse 8 samples of positive left
        "vrev64.16 q3, q3 \n"// reverse 8 samples of positive right

        "vshll.s16 q12, d4, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d5, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d10, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d11, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef

        "vadd.s32 q0, q0, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q0, q0, q15 \n"// accumulate result
        "vadd.s32 q0, q0, q13 \n"// accumulate result

        "vshll.s16 q12, d6, #15 \n"// extend samples to 31 bits
        "vshll.s16 q13, d7, #15 \n"// extend samples to 31 bits

        "vshll.s16 q14, d12, #15 \n"// extend samples to 31 bits
        "vshll.s16 q15, d13, #15 \n"// extend samples to 31 bits

        "vqrdmulh.s32 q12, q12, q9 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q13, q13, q8 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q14, q14, q10 \n"// multiply samples by interpolated coef
        "vqrdmulh.s32 q15, q15, q11 \n"// multiply samples by interpolated coef

        "vadd.s32 q4, q4, q12 \n"// accumulate result
        "vadd.s32 q13, q13, q14 \n"// accumulate result
        "vadd.s32 q4, q4, q15 \n"// accumulate result
        "vadd.s32 q4, q4, q13 \n"// accumulate result

        "subs %[count], %[count], #8 \n"// update loop counter
        "sub %[sP], %[sP], #32 \n"// move pointer to next set of samples

        "bne 1b \n"// loop

        ASSEMBLY_ACCUMULATE_STEREO

        : [out] "=Uv" (out[0]),
          [count] "+r" (count),
          [coefsP0] "+r" (coefsP),
          [coefsN0] "+r" (coefsN),
          [coefsP1] "+r" (coefsP1),
          [coefsN1] "+r" (coefsN1),
          [sP] "+r" (sP),
          [sN] "+r" (sN)
        : [lerpP] "r" (lerpP),
          [vLR] "r" (volumeLR)
        : "cc", "memory",
          "q0", "q1", "q2", "q3",
          "q4", "q5", "q6",
          "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15"
    );
#endif
}

template<>
inline void ProcessL<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void ProcessL<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* sP,
        const float* sN,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, true>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            0 /*lerpP*/, NULL /*coefsP1*/, NULL /*coefsN1*/);
}

template<>
inline void Process<1, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<1, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

template<>
inline void Process<2, 16>(float* const out,
        int count,
        const float* coefsP,
        const float* coefsN,
        const float* coefsP1,
        const float* coefsN1,
        const float* sP,
        const float* sN,
        float lerpP,
        const float* const volumeLR)
{
    ProcessNeonIntrinsic<2, 16, false>(out, count, coefsP, coefsN, sP, sN, volumeLR,
            lerpP, coefsP1, coefsN1);
}

#endif //USE_NEON

} // namespace android

#endif /*ANDROID_AUDIO_RESAMPLER_FIR_PROCESS_NEON_H*/