mirror of
https://github.com/mollyim/webrtc.git
synced 2025-05-13 22:00:47 +01:00

In https://webrtc-review.googlesource.com/c/src/+/1560 we moved WebRTC from src/webrtc to src/ (in order to preserve a healthy git history). This CL takes care of fixing header guards, #include paths, etc... NOPRESUBMIT=true NOTREECHECKS=true NOTRY=true TBR=tommi@webrtc.org Bug: chromium:611808 Change-Id: Iea91618212bee0af16aa3f05071eab8f93706578 Reviewed-on: https://webrtc-review.googlesource.com/1561 Reviewed-by: Mirko Bonadei <mbonadei@webrtc.org> Reviewed-by: Henrik Kjellander <kjellander@webrtc.org> Commit-Queue: Mirko Bonadei <mbonadei@webrtc.org> Cr-Commit-Position: refs/heads/master@{#19846}
195 lines
6.4 KiB
C
195 lines
6.4 KiB
C
/*
|
|
* Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include <arm_neon.h>
|
|
|
|
#include "modules/audio_coding/codecs/isac/fix/source/codec.h"
|
|
#include "modules/audio_coding/codecs/isac/fix/source/settings.h"
|
|
|
|
// Contains a function for the core loop in the normalized lattice MA
|
|
// filter routine for iSAC codec, optimized for ARM Neon platform.
|
|
// It does:
|
|
// for 0 <= n < HALF_SUBFRAMELEN - 1:
|
|
// *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
|
|
// *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
|
|
// Output is not bit-exact with the reference C code, due to the replacement
|
|
// of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
|
|
// instructions. The difference should not be bigger than 1.
|
|
void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient
|
|
int16_t input1, // Filter coefficient
|
|
int32_t input2, // Inverse coefficient
|
|
int32_t* ptr0, // Sample buffer
|
|
int32_t* ptr1, // Sample buffer
|
|
int32_t* ptr2) // Sample buffer
|
|
{
|
|
int n = 0;
|
|
int loop = (HALF_SUBFRAMELEN - 1) >> 3;
|
|
int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;
|
|
|
|
int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
|
|
int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
|
|
int32x4_t input2_v = vdupq_n_s32(input2);
|
|
int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
|
|
int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
|
|
int32x4_t ptr0va, ptr1va, ptr2va;
|
|
int32x4_t ptr0vb, ptr1vb, ptr2vb;
|
|
|
|
int64x2_t tmp2al_low, tmp2al_high, tmp2bl_low, tmp2bl_high;
|
|
// Unroll to process 8 samples at once.
|
|
for (n = 0; n < loop; n++) {
|
|
ptr0va = vld1q_s32(ptr0);
|
|
ptr0vb = vld1q_s32(ptr0 + 4);
|
|
ptr0 += 8;
|
|
|
|
ptr2va = vld1q_s32(ptr2);
|
|
ptr2vb = vld1q_s32(ptr2 + 4);
|
|
|
|
// Calculate tmp0 = (*ptr0) * input0.
|
|
tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
|
|
tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);
|
|
|
|
// Calculate tmp1 = (*ptr0) * input1.
|
|
tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
|
|
tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);
|
|
|
|
// Calculate tmp2 = tmp0 + *(ptr2).
|
|
tmp2a = vaddq_s32(tmp0a, ptr2va);
|
|
tmp2b = vaddq_s32(tmp0b, ptr2vb);
|
|
|
|
// Calculate *ptr2 = input2 * tmp2.
|
|
tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
|
|
#if defined(WEBRTC_ARCH_ARM64)
|
|
tmp2al_high = vmull_high_s32(tmp2a, input2_v);
|
|
#else
|
|
tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
|
|
#endif
|
|
ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
|
|
vrshrn_n_s64(tmp2al_high, 16));
|
|
|
|
tmp2bl_low = vmull_s32(vget_low_s32(tmp2b), vget_low_s32(input2_v));
|
|
#if defined(WEBRTC_ARCH_ARM64)
|
|
tmp2bl_high = vmull_high_s32(tmp2b, input2_v);
|
|
#else
|
|
tmp2bl_high = vmull_s32(vget_high_s32(tmp2b), vget_high_s32(input2_v));
|
|
#endif
|
|
ptr2vb = vcombine_s32(vrshrn_n_s64(tmp2bl_low, 16),
|
|
vrshrn_n_s64(tmp2bl_high, 16));
|
|
|
|
vst1q_s32(ptr2, ptr2va);
|
|
vst1q_s32(ptr2 + 4, ptr2vb);
|
|
ptr2 += 8;
|
|
|
|
// Calculate tmp3 = ptr2v * input0.
|
|
tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
|
|
tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);
|
|
|
|
// Calculate *ptr1 = tmp1 + tmp3.
|
|
ptr1va = vaddq_s32(tmp1a, tmp3a);
|
|
ptr1vb = vaddq_s32(tmp1b, tmp3b);
|
|
|
|
vst1q_s32(ptr1, ptr1va);
|
|
vst1q_s32(ptr1 + 4, ptr1vb);
|
|
ptr1 += 8;
|
|
}
|
|
|
|
// Process four more samples.
|
|
if (loop_tail & 0x4) {
|
|
ptr0va = vld1q_s32(ptr0);
|
|
ptr2va = vld1q_s32(ptr2);
|
|
ptr0 += 4;
|
|
|
|
// Calculate tmp0 = (*ptr0) * input0.
|
|
tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
|
|
|
|
// Calculate tmp1 = (*ptr0) * input1.
|
|
tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
|
|
|
|
// Calculate tmp2 = tmp0 + *(ptr2).
|
|
tmp2a = vaddq_s32(tmp0a, ptr2va);
|
|
|
|
// Calculate *ptr2 = input2 * tmp2.
|
|
tmp2al_low = vmull_s32(vget_low_s32(tmp2a), vget_low_s32(input2_v));
|
|
|
|
#if defined(WEBRTC_ARCH_ARM64)
|
|
tmp2al_high = vmull_high_s32(tmp2a, input2_v);
|
|
#else
|
|
tmp2al_high = vmull_s32(vget_high_s32(tmp2a), vget_high_s32(input2_v));
|
|
#endif
|
|
ptr2va = vcombine_s32(vrshrn_n_s64(tmp2al_low, 16),
|
|
vrshrn_n_s64(tmp2al_high, 16));
|
|
|
|
vst1q_s32(ptr2, ptr2va);
|
|
ptr2 += 4;
|
|
|
|
// Calculate tmp3 = *(ptr2) * input0.
|
|
tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
|
|
|
|
// Calculate *ptr1 = tmp1 + tmp3.
|
|
ptr1va = vaddq_s32(tmp1a, tmp3a);
|
|
|
|
vst1q_s32(ptr1, ptr1va);
|
|
ptr1 += 4;
|
|
}
|
|
|
|
// Process two more samples.
|
|
if (loop_tail & 0x2) {
|
|
int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
|
|
int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
|
|
int64x2_t tmp2l_tail;
|
|
ptr0v_tail = vld1_s32(ptr0);
|
|
ptr2v_tail = vld1_s32(ptr2);
|
|
ptr0 += 2;
|
|
|
|
// Calculate tmp0 = (*ptr0) * input0.
|
|
tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));
|
|
|
|
// Calculate tmp1 = (*ptr0) * input1.
|
|
tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));
|
|
|
|
// Calculate tmp2 = tmp0 + *(ptr2).
|
|
tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
|
|
|
|
// Calculate *ptr2 = input2 * tmp2.
|
|
tmp2l_tail = vmull_s32(tmp2_tail, vget_low_s32(input2_v));
|
|
ptr2v_tail = vrshrn_n_s64(tmp2l_tail, 16);
|
|
|
|
vst1_s32(ptr2, ptr2v_tail);
|
|
ptr2 += 2;
|
|
|
|
// Calculate tmp3 = *(ptr2) * input0.
|
|
tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));
|
|
|
|
// Calculate *ptr1 = tmp1 + tmp3.
|
|
ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);
|
|
|
|
vst1_s32(ptr1, ptr1v_tail);
|
|
ptr1 += 2;
|
|
}
|
|
|
|
// Process one more sample.
|
|
if (loop_tail & 0x1) {
|
|
int16_t t16a = (int16_t)(input2 >> 16);
|
|
int16_t t16b = (int16_t)input2;
|
|
if (t16b < 0) t16a++;
|
|
int32_t tmp32a;
|
|
int32_t tmp32b;
|
|
|
|
// Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
|
|
tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
|
|
tmp32b = *ptr2 + tmp32a;
|
|
*ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
|
|
(WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));
|
|
|
|
// Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
|
|
tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
|
|
tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
|
|
*ptr1 = tmp32a + tmp32b;
|
|
}
|
|
}
|