mirror of
https://github.com/mollyim/webrtc.git
synced 2025-05-16 07:10:38 +01:00
Support AVX2/FMA intrinsics in Audio Resampler module
From the test result, using AVX2/FMA is 1.60x faster than SSE on atlas. Bug: webrtc:11663 Test: common_audio_unittests on atlas and octopus. Change-Id: Ibd45ea46aa97d5790a24e5116f741592b95f6416 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/176382 Reviewed-by: Per Åhgren <peah@webrtc.org> Reviewed-by: Henrik Andreassson <henrika@webrtc.org> Reviewed-by: Mirko Bonadei <mbonadei@webrtc.org> Reviewed-by: Sam Zackrisson <saza@webrtc.org> Commit-Queue: Sam Zackrisson <saza@webrtc.org> Cr-Commit-Position: refs/heads/master@{#31810}
This commit is contained in:
parent
6f148566dc
commit
1ca8d87239
9 changed files with 168 additions and 59 deletions
4
BUILD.gn
4
BUILD.gn
|
@ -129,6 +129,10 @@ config("common_inherited_config") {
|
||||||
defines += [ "RTC_DISABLE_CHECK_MSG" ]
|
defines += [ "RTC_DISABLE_CHECK_MSG" ]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (rtc_enable_avx2) {
|
||||||
|
defines += [ "WEBRTC_ENABLE_AVX2" ]
|
||||||
|
}
|
||||||
|
|
||||||
# Some tests need to declare their own trace event handlers. If this define is
|
# Some tests need to declare their own trace event handlers. If this define is
|
||||||
# not set, the first time TRACE_EVENT_* is called it will store the return
|
# not set, the first time TRACE_EVENT_* is called it will store the return
|
||||||
# value for the current handler in an static variable, so that subsequent
|
# value for the current handler in an static variable, so that subsequent
|
||||||
|
|
|
@ -67,6 +67,7 @@ rtc_library("common_audio") {
|
||||||
|
|
||||||
if (current_cpu == "x86" || current_cpu == "x64") {
|
if (current_cpu == "x86" || current_cpu == "x64") {
|
||||||
deps += [ ":common_audio_sse2" ]
|
deps += [ ":common_audio_sse2" ]
|
||||||
|
deps += [ ":common_audio_avx2" ]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -235,6 +236,7 @@ rtc_library("fir_filter_factory") {
|
||||||
]
|
]
|
||||||
if (current_cpu == "x86" || current_cpu == "x64") {
|
if (current_cpu == "x86" || current_cpu == "x64") {
|
||||||
deps += [ ":common_audio_sse2" ]
|
deps += [ ":common_audio_sse2" ]
|
||||||
|
deps += [ ":common_audio_avx2" ]
|
||||||
}
|
}
|
||||||
if (rtc_build_with_neon) {
|
if (rtc_build_with_neon) {
|
||||||
deps += [ ":common_audio_neon" ]
|
deps += [ ":common_audio_neon" ]
|
||||||
|
@ -261,6 +263,27 @@ if (current_cpu == "x86" || current_cpu == "x64") {
|
||||||
"../rtc_base/memory:aligned_malloc",
|
"../rtc_base/memory:aligned_malloc",
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rtc_library("common_audio_avx2") {
|
||||||
|
sources = [ "resampler/sinc_resampler_avx2.cc" ]
|
||||||
|
|
||||||
|
if (is_win) {
|
||||||
|
cflags = [ "/arch:AVX2" ]
|
||||||
|
} else {
|
||||||
|
cflags = [
|
||||||
|
"-mavx2",
|
||||||
|
"-mfma",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
deps = [
|
||||||
|
":fir_filter",
|
||||||
|
":sinc_resampler",
|
||||||
|
"../rtc_base:checks",
|
||||||
|
"../rtc_base:rtc_base_approved",
|
||||||
|
"../rtc_base/memory:aligned_malloc",
|
||||||
|
]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rtc_build_with_neon) {
|
if (rtc_build_with_neon) {
|
||||||
|
|
|
@ -122,28 +122,22 @@ double SincScaleFactor(double io_ratio) {
|
||||||
const size_t SincResampler::kKernelSize;
|
const size_t SincResampler::kKernelSize;
|
||||||
|
|
||||||
// If we know the minimum architecture at compile time, avoid CPU detection.
|
// If we know the minimum architecture at compile time, avoid CPU detection.
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
|
||||||
#if defined(__SSE2__)
|
|
||||||
#define CONVOLVE_FUNC Convolve_SSE
|
|
||||||
void SincResampler::InitializeCPUSpecificFeatures() {}
|
|
||||||
#else
|
|
||||||
// x86 CPU detection required. Function will be set by
|
|
||||||
// InitializeCPUSpecificFeatures().
|
|
||||||
// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed.
|
|
||||||
#define CONVOLVE_FUNC convolve_proc_
|
|
||||||
|
|
||||||
void SincResampler::InitializeCPUSpecificFeatures() {
|
void SincResampler::InitializeCPUSpecificFeatures() {
|
||||||
convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C;
|
#if defined(WEBRTC_HAS_NEON)
|
||||||
}
|
convolve_proc_ = Convolve_NEON;
|
||||||
#endif
|
#elif defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
#elif defined(WEBRTC_HAS_NEON)
|
// Using AVX2 instead of SSE2 when AVX2 supported.
|
||||||
#define CONVOLVE_FUNC Convolve_NEON
|
if (WebRtc_GetCPUInfo(kAVX2))
|
||||||
void SincResampler::InitializeCPUSpecificFeatures() {}
|
convolve_proc_ = Convolve_AVX2;
|
||||||
|
else if (WebRtc_GetCPUInfo(kSSE2))
|
||||||
|
convolve_proc_ = Convolve_SSE;
|
||||||
|
else
|
||||||
|
convolve_proc_ = Convolve_C;
|
||||||
#else
|
#else
|
||||||
// Unknown architecture.
|
// Unknown architecture.
|
||||||
#define CONVOLVE_FUNC Convolve_C
|
convolve_proc_ = Convolve_C;
|
||||||
void SincResampler::InitializeCPUSpecificFeatures() {}
|
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
SincResampler::SincResampler(double io_sample_rate_ratio,
|
SincResampler::SincResampler(double io_sample_rate_ratio,
|
||||||
size_t request_frames,
|
size_t request_frames,
|
||||||
|
@ -152,24 +146,20 @@ SincResampler::SincResampler(double io_sample_rate_ratio,
|
||||||
read_cb_(read_cb),
|
read_cb_(read_cb),
|
||||||
request_frames_(request_frames),
|
request_frames_(request_frames),
|
||||||
input_buffer_size_(request_frames_ + kKernelSize),
|
input_buffer_size_(request_frames_ + kKernelSize),
|
||||||
// Create input buffers with a 16-byte alignment for SSE optimizations.
|
// Create input buffers with a 32-byte alignment for SIMD optimizations.
|
||||||
kernel_storage_(static_cast<float*>(
|
kernel_storage_(static_cast<float*>(
|
||||||
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
|
AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
|
||||||
kernel_pre_sinc_storage_(static_cast<float*>(
|
kernel_pre_sinc_storage_(static_cast<float*>(
|
||||||
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
|
AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
|
||||||
kernel_window_storage_(static_cast<float*>(
|
kernel_window_storage_(static_cast<float*>(
|
||||||
AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))),
|
AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))),
|
||||||
input_buffer_(static_cast<float*>(
|
input_buffer_(static_cast<float*>(
|
||||||
AlignedMalloc(sizeof(float) * input_buffer_size_, 16))),
|
AlignedMalloc(sizeof(float) * input_buffer_size_, 32))),
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
|
|
||||||
convolve_proc_(nullptr),
|
convolve_proc_(nullptr),
|
||||||
#endif
|
|
||||||
r1_(input_buffer_.get()),
|
r1_(input_buffer_.get()),
|
||||||
r2_(input_buffer_.get() + kKernelSize / 2) {
|
r2_(input_buffer_.get() + kKernelSize / 2) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
|
|
||||||
InitializeCPUSpecificFeatures();
|
InitializeCPUSpecificFeatures();
|
||||||
RTC_DCHECK(convolve_proc_);
|
RTC_DCHECK(convolve_proc_);
|
||||||
#endif
|
|
||||||
RTC_DCHECK_GT(request_frames_, 0);
|
RTC_DCHECK_GT(request_frames_, 0);
|
||||||
Flush();
|
Flush();
|
||||||
RTC_DCHECK_GT(block_size_, kKernelSize);
|
RTC_DCHECK_GT(block_size_, kKernelSize);
|
||||||
|
@ -302,10 +292,10 @@ void SincResampler::Resample(size_t frames, float* destination) {
|
||||||
const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
|
const float* const k1 = kernel_ptr + offset_idx * kKernelSize;
|
||||||
const float* const k2 = k1 + kKernelSize;
|
const float* const k2 = k1 + kKernelSize;
|
||||||
|
|
||||||
// Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be
|
// Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always be
|
||||||
// true so long as kKernelSize is a multiple of 16.
|
// true so long as kKernelSize is a multiple of 32.
|
||||||
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 16);
|
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k1) % 32);
|
||||||
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 16);
|
RTC_DCHECK_EQ(0, reinterpret_cast<uintptr_t>(k2) % 32);
|
||||||
|
|
||||||
// Initialize input pointer based on quantized |virtual_source_idx_|.
|
// Initialize input pointer based on quantized |virtual_source_idx_|.
|
||||||
const float* const input_ptr = r1_ + source_idx;
|
const float* const input_ptr = r1_ + source_idx;
|
||||||
|
@ -314,7 +304,7 @@ void SincResampler::Resample(size_t frames, float* destination) {
|
||||||
const double kernel_interpolation_factor =
|
const double kernel_interpolation_factor =
|
||||||
virtual_offset_idx - offset_idx;
|
virtual_offset_idx - offset_idx;
|
||||||
*destination++ =
|
*destination++ =
|
||||||
CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor);
|
convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor);
|
||||||
|
|
||||||
// Advance the virtual index.
|
// Advance the virtual index.
|
||||||
virtual_source_idx_ += current_io_ratio;
|
virtual_source_idx_ += current_io_ratio;
|
||||||
|
|
|
@ -112,6 +112,10 @@ class SincResampler {
|
||||||
const float* k1,
|
const float* k1,
|
||||||
const float* k2,
|
const float* k2,
|
||||||
double kernel_interpolation_factor);
|
double kernel_interpolation_factor);
|
||||||
|
static float Convolve_AVX2(const float* input_ptr,
|
||||||
|
const float* k1,
|
||||||
|
const float* k2,
|
||||||
|
double kernel_interpolation_factor);
|
||||||
#elif defined(WEBRTC_HAS_NEON)
|
#elif defined(WEBRTC_HAS_NEON)
|
||||||
static float Convolve_NEON(const float* input_ptr,
|
static float Convolve_NEON(const float* input_ptr,
|
||||||
const float* k1,
|
const float* k1,
|
||||||
|
@ -155,13 +159,11 @@ class SincResampler {
|
||||||
// TODO(ajm): Move to using a global static which must only be initialized
|
// TODO(ajm): Move to using a global static which must only be initialized
|
||||||
// once by the user. We're not doing this initially, because we don't have
|
// once by the user. We're not doing this initially, because we don't have
|
||||||
// e.g. a LazyInstance helper in webrtc.
|
// e.g. a LazyInstance helper in webrtc.
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__)
|
|
||||||
typedef float (*ConvolveProc)(const float*,
|
typedef float (*ConvolveProc)(const float*,
|
||||||
const float*,
|
const float*,
|
||||||
const float*,
|
const float*,
|
||||||
double);
|
double);
|
||||||
ConvolveProc convolve_proc_;
|
ConvolveProc convolve_proc_;
|
||||||
#endif
|
|
||||||
|
|
||||||
// Pointers to the various regions inside |input_buffer_|. See the diagram at
|
// Pointers to the various regions inside |input_buffer_|. See the diagram at
|
||||||
// the top of the .cc file for more information.
|
// the top of the .cc file for more information.
|
||||||
|
|
66
common_audio/resampler/sinc_resampler_avx2.cc
Normal file
66
common_audio/resampler/sinc_resampler_avx2.cc
Normal file
|
@ -0,0 +1,66 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2020 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
|
||||||
|
#include "common_audio/resampler/sinc_resampler.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
|
||||||
|
float SincResampler::Convolve_AVX2(const float* input_ptr,
|
||||||
|
const float* k1,
|
||||||
|
const float* k2,
|
||||||
|
double kernel_interpolation_factor) {
|
||||||
|
__m256 m_input;
|
||||||
|
__m256 m_sums1 = _mm256_setzero_ps();
|
||||||
|
__m256 m_sums2 = _mm256_setzero_ps();
|
||||||
|
|
||||||
|
// Based on |input_ptr| alignment, we need to use loadu or load. Unrolling
|
||||||
|
// these loops has not been tested or benchmarked.
|
||||||
|
bool aligned_input = (reinterpret_cast<uintptr_t>(input_ptr) & 0x1F) == 0;
|
||||||
|
if (!aligned_input) {
|
||||||
|
for (size_t i = 0; i < kKernelSize; i += 8) {
|
||||||
|
m_input = _mm256_loadu_ps(input_ptr + i);
|
||||||
|
m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
|
||||||
|
m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (size_t i = 0; i < kKernelSize; i += 8) {
|
||||||
|
m_input = _mm256_load_ps(input_ptr + i);
|
||||||
|
m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1);
|
||||||
|
m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Linearly interpolate the two "convolutions".
|
||||||
|
__m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0),
|
||||||
|
_mm256_extractf128_ps(m_sums1, 1));
|
||||||
|
__m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0),
|
||||||
|
_mm256_extractf128_ps(m_sums2, 1));
|
||||||
|
m128_sums1 = _mm_mul_ps(
|
||||||
|
m128_sums1,
|
||||||
|
_mm_set_ps1(static_cast<float>(1.0 - kernel_interpolation_factor)));
|
||||||
|
m128_sums2 = _mm_mul_ps(
|
||||||
|
m128_sums2, _mm_set_ps1(static_cast<float>(kernel_interpolation_factor)));
|
||||||
|
m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2);
|
||||||
|
|
||||||
|
// Sum components together.
|
||||||
|
float result;
|
||||||
|
m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1);
|
||||||
|
_mm_store_ss(&result, _mm_add_ss(m128_sums2,
|
||||||
|
_mm_shuffle_ps(m128_sums2, m128_sums2, 1)));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace webrtc
|
|
@ -116,17 +116,9 @@ TEST(SincResamplerTest, DISABLED_SetRatioBench) {
|
||||||
printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000);
|
printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Define platform independent function name for Convolve* tests.
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
|
||||||
#define CONVOLVE_FUNC Convolve_SSE
|
|
||||||
#elif defined(WEBRTC_ARCH_ARM_V7)
|
|
||||||
#define CONVOLVE_FUNC Convolve_NEON
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Ensure various optimized Convolve() methods return the same value. Only run
|
// Ensure various optimized Convolve() methods return the same value. Only run
|
||||||
// this test if other optimized methods exist, otherwise the default Convolve()
|
// this test if other optimized methods exist, otherwise the default Convolve()
|
||||||
// will be tested by the parameterized SincResampler tests below.
|
// will be tested by the parameterized SincResampler tests below.
|
||||||
#if defined(CONVOLVE_FUNC)
|
|
||||||
TEST(SincResamplerTest, Convolve) {
|
TEST(SincResamplerTest, Convolve) {
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
|
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
|
||||||
|
@ -148,7 +140,7 @@ TEST(SincResamplerTest, Convolve) {
|
||||||
double result = resampler.Convolve_C(
|
double result = resampler.Convolve_C(
|
||||||
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
|
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
|
||||||
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
||||||
double result2 = resampler.CONVOLVE_FUNC(
|
double result2 = resampler.convolve_proc_(
|
||||||
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
|
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
|
||||||
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
||||||
EXPECT_NEAR(result2, result, kEpsilon);
|
EXPECT_NEAR(result2, result, kEpsilon);
|
||||||
|
@ -157,12 +149,11 @@ TEST(SincResamplerTest, Convolve) {
|
||||||
result = resampler.Convolve_C(
|
result = resampler.Convolve_C(
|
||||||
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
|
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
|
||||||
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
||||||
result2 = resampler.CONVOLVE_FUNC(
|
result2 = resampler.convolve_proc_(
|
||||||
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
|
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
|
||||||
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
||||||
EXPECT_NEAR(result2, result, kEpsilon);
|
EXPECT_NEAR(result2, result, kEpsilon);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
// Benchmark for the various Convolve() methods. Make sure to build with
|
// Benchmark for the various Convolve() methods. Make sure to build with
|
||||||
// branding=Chrome so that RTC_DCHECKs are compiled out when benchmarking.
|
// branding=Chrome so that RTC_DCHECKs are compiled out when benchmarking.
|
||||||
|
@ -190,7 +181,6 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
|
||||||
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
|
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
|
||||||
printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000);
|
printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000);
|
||||||
|
|
||||||
#if defined(CONVOLVE_FUNC)
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
|
ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2));
|
||||||
#elif defined(WEBRTC_ARCH_ARM_V7)
|
#elif defined(WEBRTC_ARCH_ARM_V7)
|
||||||
|
@ -200,36 +190,33 @@ TEST(SincResamplerTest, ConvolveBenchmark) {
|
||||||
// Benchmark with unaligned input pointer.
|
// Benchmark with unaligned input pointer.
|
||||||
start = rtc::TimeNanos();
|
start = rtc::TimeNanos();
|
||||||
for (int j = 0; j < kConvolveIterations; ++j) {
|
for (int j = 0; j < kConvolveIterations; ++j) {
|
||||||
resampler.CONVOLVE_FUNC(
|
resampler.convolve_proc_(
|
||||||
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
|
resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(),
|
||||||
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
||||||
}
|
}
|
||||||
double total_time_optimized_unaligned_us =
|
double total_time_optimized_unaligned_us =
|
||||||
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
|
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
|
||||||
printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx "
|
printf(STRINGIZE(convolve_proc_) "(unaligned) took %.2fms; which is %.2fx "
|
||||||
"faster than Convolve_C.\n", total_time_optimized_unaligned_us / 1000,
|
"faster than Convolve_C.\n", total_time_optimized_unaligned_us / 1000,
|
||||||
total_time_c_us / total_time_optimized_unaligned_us);
|
total_time_c_us / total_time_optimized_unaligned_us);
|
||||||
|
|
||||||
// Benchmark with aligned input pointer.
|
// Benchmark with aligned input pointer.
|
||||||
start = rtc::TimeNanos();
|
start = rtc::TimeNanos();
|
||||||
for (int j = 0; j < kConvolveIterations; ++j) {
|
for (int j = 0; j < kConvolveIterations; ++j) {
|
||||||
resampler.CONVOLVE_FUNC(
|
resampler.convolve_proc_(
|
||||||
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
|
resampler.kernel_storage_.get(), resampler.kernel_storage_.get(),
|
||||||
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
resampler.kernel_storage_.get(), kKernelInterpolationFactor);
|
||||||
}
|
}
|
||||||
double total_time_optimized_aligned_us =
|
double total_time_optimized_aligned_us =
|
||||||
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
|
(rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec;
|
||||||
printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx "
|
printf(STRINGIZE(convolve_proc_) " (aligned) took %.2fms; which is %.2fx "
|
||||||
"faster than Convolve_C and %.2fx faster than "
|
"faster than Convolve_C and %.2fx faster than "
|
||||||
STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n",
|
STRINGIZE(convolve_proc_) " (unaligned).\n",
|
||||||
total_time_optimized_aligned_us / 1000,
|
total_time_optimized_aligned_us / 1000,
|
||||||
total_time_c_us / total_time_optimized_aligned_us,
|
total_time_c_us / total_time_optimized_aligned_us,
|
||||||
total_time_optimized_unaligned_us / total_time_optimized_aligned_us);
|
total_time_optimized_unaligned_us / total_time_optimized_aligned_us);
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#undef CONVOLVE_FUNC
|
|
||||||
|
|
||||||
typedef std::tuple<int, int, double, double> SincResamplerTestData;
|
typedef std::tuple<int, int, double, double> SincResamplerTestData;
|
||||||
class SincResamplerTest
|
class SincResamplerTest
|
||||||
: public ::testing::TestWithParam<SincResamplerTestData> {
|
: public ::testing::TestWithParam<SincResamplerTestData> {
|
||||||
|
@ -352,7 +339,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||||
std::make_tuple(16000, 44100, kResamplingRMSError, -62.54),
|
std::make_tuple(16000, 44100, kResamplingRMSError, -62.54),
|
||||||
std::make_tuple(22050, 44100, kResamplingRMSError, -73.53),
|
std::make_tuple(22050, 44100, kResamplingRMSError, -73.53),
|
||||||
std::make_tuple(32000, 44100, kResamplingRMSError, -63.32),
|
std::make_tuple(32000, 44100, kResamplingRMSError, -63.32),
|
||||||
std::make_tuple(44100, 44100, kResamplingRMSError, -73.53),
|
std::make_tuple(44100, 44100, kResamplingRMSError, -73.52),
|
||||||
std::make_tuple(48000, 44100, -15.01, -64.04),
|
std::make_tuple(48000, 44100, -15.01, -64.04),
|
||||||
std::make_tuple(96000, 44100, -18.49, -25.51),
|
std::make_tuple(96000, 44100, -18.49, -25.51),
|
||||||
std::make_tuple(192000, 44100, -20.50, -13.31),
|
std::make_tuple(192000, 44100, -20.50, -13.31),
|
||||||
|
@ -360,7 +347,7 @@ INSTANTIATE_TEST_SUITE_P(
|
||||||
// To 48kHz
|
// To 48kHz
|
||||||
std::make_tuple(8000, 48000, kResamplingRMSError, -63.43),
|
std::make_tuple(8000, 48000, kResamplingRMSError, -63.43),
|
||||||
std::make_tuple(11025, 48000, kResamplingRMSError, -62.61),
|
std::make_tuple(11025, 48000, kResamplingRMSError, -62.61),
|
||||||
std::make_tuple(16000, 48000, kResamplingRMSError, -63.96),
|
std::make_tuple(16000, 48000, kResamplingRMSError, -63.95),
|
||||||
std::make_tuple(22050, 48000, kResamplingRMSError, -62.42),
|
std::make_tuple(22050, 48000, kResamplingRMSError, -62.42),
|
||||||
std::make_tuple(32000, 48000, kResamplingRMSError, -64.04),
|
std::make_tuple(32000, 48000, kResamplingRMSError, -64.04),
|
||||||
std::make_tuple(44100, 48000, kResamplingRMSError, -62.63),
|
std::make_tuple(44100, 48000, kResamplingRMSError, -62.63),
|
||||||
|
|
|
@ -18,7 +18,7 @@ extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// List of features in x86.
|
// List of features in x86.
|
||||||
typedef enum { kSSE2, kSSE3 } CPUFeature;
|
typedef enum { kSSE2, kSSE3, kAVX2 } CPUFeature;
|
||||||
|
|
||||||
// List of features in ARM.
|
// List of features in ARM.
|
||||||
enum {
|
enum {
|
||||||
|
|
|
@ -24,6 +24,20 @@ int GetCPUInfoNoASM(CPUFeature feature) {
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
#if defined(WEBRTC_ARCH_X86_FAMILY)
|
||||||
|
|
||||||
|
// xgetbv returns the value of an Intel Extended Control Register (XCR).
|
||||||
|
// Currently only XCR0 is defined by Intel so |xcr| should always be zero.
|
||||||
|
uint64_t xgetbv(uint32_t xcr) {
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
return _xgetbv(xcr);
|
||||||
|
#else
|
||||||
|
uint32_t eax, edx;
|
||||||
|
|
||||||
|
__asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr));
|
||||||
|
return (static_cast<uint64_t>(edx) << 32) | eax;
|
||||||
|
#endif // _MSC_VER
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef _MSC_VER
|
#ifndef _MSC_VER
|
||||||
// Intrinsic for "cpuid".
|
// Intrinsic for "cpuid".
|
||||||
#if defined(__pic__) && defined(__i386__)
|
#if defined(__pic__) && defined(__i386__)
|
||||||
|
@ -41,7 +55,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) {
|
||||||
__asm__ volatile("cpuid\n"
|
__asm__ volatile("cpuid\n"
|
||||||
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
|
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]),
|
||||||
"=d"(cpu_info[3])
|
"=d"(cpu_info[3])
|
||||||
: "a"(info_type));
|
: "a"(info_type), "c"(0));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#endif // _MSC_VER
|
#endif // _MSC_VER
|
||||||
|
@ -51,6 +65,8 @@ static inline void __cpuid(int cpu_info[4], int info_type) {
|
||||||
// Actual feature detection for x86.
|
// Actual feature detection for x86.
|
||||||
static int GetCPUInfo(CPUFeature feature) {
|
static int GetCPUInfo(CPUFeature feature) {
|
||||||
int cpu_info[4];
|
int cpu_info[4];
|
||||||
|
__cpuid(cpu_info, 0);
|
||||||
|
int num_ids = cpu_info[0];
|
||||||
__cpuid(cpu_info, 1);
|
__cpuid(cpu_info, 1);
|
||||||
if (feature == kSSE2) {
|
if (feature == kSSE2) {
|
||||||
return 0 != (cpu_info[3] & 0x04000000);
|
return 0 != (cpu_info[3] & 0x04000000);
|
||||||
|
@ -58,6 +74,23 @@ static int GetCPUInfo(CPUFeature feature) {
|
||||||
if (feature == kSSE3) {
|
if (feature == kSSE3) {
|
||||||
return 0 != (cpu_info[2] & 0x00000001);
|
return 0 != (cpu_info[2] & 0x00000001);
|
||||||
}
|
}
|
||||||
|
if (feature == kAVX2) {
|
||||||
|
// Interpret CPU feature information.
|
||||||
|
int cpu_info7[4] = {-1};
|
||||||
|
if (num_ids >= 7) {
|
||||||
|
__cpuid(cpu_info7, 7);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(WEBRTC_ENABLE_AVX2)
|
||||||
|
return (cpu_info[2] & 0x10000000) != 0 &&
|
||||||
|
(cpu_info[2] & 0x04000000) != 0 /* XSAVE */ &&
|
||||||
|
(cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ &&
|
||||||
|
(xgetbv(0) & 0x00000006) == 6 /* XSAVE enabled by kernel */ &&
|
||||||
|
(cpu_info7[1] & 0x00000020) != 0;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif // WEBRTC_ENABLE_AVX2
|
||||||
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -242,6 +242,10 @@ declare_args() {
|
||||||
# standalone WebRTC.
|
# standalone WebRTC.
|
||||||
rtc_include_internal_audio_device = !build_with_chromium
|
rtc_include_internal_audio_device = !build_with_chromium
|
||||||
|
|
||||||
|
# Set this to true to enable the avx2 support in webrtc.
|
||||||
|
# TODO(bugs.webrtc.org/11663): Default this to true and eventually remove.
|
||||||
|
rtc_enable_avx2 = false
|
||||||
|
|
||||||
# Include tests in standalone checkout.
|
# Include tests in standalone checkout.
|
||||||
rtc_include_tests = !build_with_chromium && !build_with_mozilla
|
rtc_include_tests = !build_with_chromium && !build_with_mozilla
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue