diff --git a/BUILD.gn b/BUILD.gn index b676d1e15e..c844e3853f 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -129,6 +129,10 @@ config("common_inherited_config") { defines += [ "RTC_DISABLE_CHECK_MSG" ] } + if (rtc_enable_avx2) { + defines += [ "WEBRTC_ENABLE_AVX2" ] + } + # Some tests need to declare their own trace event handlers. If this define is # not set, the first time TRACE_EVENT_* is called it will store the return # value for the current handler in an static variable, so that subsequent diff --git a/common_audio/BUILD.gn b/common_audio/BUILD.gn index 4077486d87..fc76351c10 100644 --- a/common_audio/BUILD.gn +++ b/common_audio/BUILD.gn @@ -67,6 +67,7 @@ rtc_library("common_audio") { if (current_cpu == "x86" || current_cpu == "x64") { deps += [ ":common_audio_sse2" ] + deps += [ ":common_audio_avx2" ] } } @@ -235,6 +236,7 @@ rtc_library("fir_filter_factory") { ] if (current_cpu == "x86" || current_cpu == "x64") { deps += [ ":common_audio_sse2" ] + deps += [ ":common_audio_avx2" ] } if (rtc_build_with_neon) { deps += [ ":common_audio_neon" ] @@ -261,6 +263,27 @@ if (current_cpu == "x86" || current_cpu == "x64") { "../rtc_base/memory:aligned_malloc", ] } + + rtc_library("common_audio_avx2") { + sources = [ "resampler/sinc_resampler_avx2.cc" ] + + if (is_win) { + cflags = [ "/arch:AVX2" ] + } else { + cflags = [ + "-mavx2", + "-mfma", + ] + } + + deps = [ + ":fir_filter", + ":sinc_resampler", + "../rtc_base:checks", + "../rtc_base:rtc_base_approved", + "../rtc_base/memory:aligned_malloc", + ] + } } if (rtc_build_with_neon) { diff --git a/common_audio/resampler/sinc_resampler.cc b/common_audio/resampler/sinc_resampler.cc index 21707e9e4e..831ce53d4a 100644 --- a/common_audio/resampler/sinc_resampler.cc +++ b/common_audio/resampler/sinc_resampler.cc @@ -122,28 +122,22 @@ double SincScaleFactor(double io_ratio) { const size_t SincResampler::kKernelSize; // If we know the minimum architecture at compile time, avoid CPU detection. -#if defined(WEBRTC_ARCH_X86_FAMILY) -#if defined(__SSE2__) -#define CONVOLVE_FUNC Convolve_SSE -void SincResampler::InitializeCPUSpecificFeatures() {} -#else -// x86 CPU detection required. Function will be set by -// InitializeCPUSpecificFeatures(). -// TODO(dalecurtis): Once Chrome moves to an SSE baseline this can be removed. -#define CONVOLVE_FUNC convolve_proc_ - void SincResampler::InitializeCPUSpecificFeatures() { - convolve_proc_ = WebRtc_GetCPUInfo(kSSE2) ? Convolve_SSE : Convolve_C; -} -#endif -#elif defined(WEBRTC_HAS_NEON) -#define CONVOLVE_FUNC Convolve_NEON -void SincResampler::InitializeCPUSpecificFeatures() {} +#if defined(WEBRTC_HAS_NEON) + convolve_proc_ = Convolve_NEON; +#elif defined(WEBRTC_ARCH_X86_FAMILY) + // Using AVX2 instead of SSE2 when AVX2 supported. + if (WebRtc_GetCPUInfo(kAVX2)) + convolve_proc_ = Convolve_AVX2; + else if (WebRtc_GetCPUInfo(kSSE2)) + convolve_proc_ = Convolve_SSE; + else + convolve_proc_ = Convolve_C; #else -// Unknown architecture. -#define CONVOLVE_FUNC Convolve_C -void SincResampler::InitializeCPUSpecificFeatures() {} + // Unknown architecture. + convolve_proc_ = Convolve_C; #endif +} SincResampler::SincResampler(double io_sample_rate_ratio, size_t request_frames, @@ -152,24 +146,20 @@ SincResampler::SincResampler(double io_sample_rate_ratio, read_cb_(read_cb), request_frames_(request_frames), input_buffer_size_(request_frames_ + kKernelSize), - // Create input buffers with a 16-byte alignment for SSE optimizations. + // Create input buffers with a 32-byte alignment for SIMD optimizations. kernel_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), + AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))), kernel_pre_sinc_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), + AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))), kernel_window_storage_(static_cast( - AlignedMalloc(sizeof(float) * kKernelStorageSize, 16))), + AlignedMalloc(sizeof(float) * kKernelStorageSize, 32))), input_buffer_(static_cast( - AlignedMalloc(sizeof(float) * input_buffer_size_, 16))), -#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__) + AlignedMalloc(sizeof(float) * input_buffer_size_, 32))), convolve_proc_(nullptr), -#endif r1_(input_buffer_.get()), r2_(input_buffer_.get() + kKernelSize / 2) { -#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__) InitializeCPUSpecificFeatures(); RTC_DCHECK(convolve_proc_); -#endif RTC_DCHECK_GT(request_frames_, 0); Flush(); RTC_DCHECK_GT(block_size_, kKernelSize); @@ -302,10 +292,10 @@ void SincResampler::Resample(size_t frames, float* destination) { const float* const k1 = kernel_ptr + offset_idx * kKernelSize; const float* const k2 = k1 + kKernelSize; - // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be - // true so long as kKernelSize is a multiple of 16. - RTC_DCHECK_EQ(0, reinterpret_cast(k1) % 16); - RTC_DCHECK_EQ(0, reinterpret_cast(k2) % 16); + // Ensure |k1|, |k2| are 32-byte aligned for SIMD usage. Should always be + // true so long as kKernelSize is a multiple of 32. + RTC_DCHECK_EQ(0, reinterpret_cast(k1) % 32); + RTC_DCHECK_EQ(0, reinterpret_cast(k2) % 32); // Initialize input pointer based on quantized |virtual_source_idx_|. const float* const input_ptr = r1_ + source_idx; @@ -314,7 +304,7 @@ void SincResampler::Resample(size_t frames, float* destination) { const double kernel_interpolation_factor = virtual_offset_idx - offset_idx; *destination++ = - CONVOLVE_FUNC(input_ptr, k1, k2, kernel_interpolation_factor); + convolve_proc_(input_ptr, k1, k2, kernel_interpolation_factor); // Advance the virtual index. virtual_source_idx_ += current_io_ratio; diff --git a/common_audio/resampler/sinc_resampler.h b/common_audio/resampler/sinc_resampler.h index 5181c18dac..a72a0c62c4 100644 --- a/common_audio/resampler/sinc_resampler.h +++ b/common_audio/resampler/sinc_resampler.h @@ -112,6 +112,10 @@ class SincResampler { const float* k1, const float* k2, double kernel_interpolation_factor); + static float Convolve_AVX2(const float* input_ptr, + const float* k1, + const float* k2, + double kernel_interpolation_factor); #elif defined(WEBRTC_HAS_NEON) static float Convolve_NEON(const float* input_ptr, const float* k1, @@ -155,13 +159,11 @@ class SincResampler { // TODO(ajm): Move to using a global static which must only be initialized // once by the user. We're not doing this initially, because we don't have // e.g. a LazyInstance helper in webrtc. -#if defined(WEBRTC_ARCH_X86_FAMILY) && !defined(__SSE2__) typedef float (*ConvolveProc)(const float*, const float*, const float*, double); ConvolveProc convolve_proc_; -#endif // Pointers to the various regions inside |input_buffer_|. See the diagram at // the top of the .cc file for more information. diff --git a/common_audio/resampler/sinc_resampler_avx2.cc b/common_audio/resampler/sinc_resampler_avx2.cc new file mode 100644 index 0000000000..3eb5d4a1b1 --- /dev/null +++ b/common_audio/resampler/sinc_resampler_avx2.cc @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include +#include +#include +#include + +#include "common_audio/resampler/sinc_resampler.h" + +namespace webrtc { + +float SincResampler::Convolve_AVX2(const float* input_ptr, + const float* k1, + const float* k2, + double kernel_interpolation_factor) { + __m256 m_input; + __m256 m_sums1 = _mm256_setzero_ps(); + __m256 m_sums2 = _mm256_setzero_ps(); + + // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling + // these loops has not been tested or benchmarked. + bool aligned_input = (reinterpret_cast(input_ptr) & 0x1F) == 0; + if (!aligned_input) { + for (size_t i = 0; i < kKernelSize; i += 8) { + m_input = _mm256_loadu_ps(input_ptr + i); + m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); + m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); + } + } else { + for (size_t i = 0; i < kKernelSize; i += 8) { + m_input = _mm256_load_ps(input_ptr + i); + m_sums1 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k1 + i), m_sums1); + m_sums2 = _mm256_fmadd_ps(m_input, _mm256_load_ps(k2 + i), m_sums2); + } + } + + // Linearly interpolate the two "convolutions". + __m128 m128_sums1 = _mm_add_ps(_mm256_extractf128_ps(m_sums1, 0), + _mm256_extractf128_ps(m_sums1, 1)); + __m128 m128_sums2 = _mm_add_ps(_mm256_extractf128_ps(m_sums2, 0), + _mm256_extractf128_ps(m_sums2, 1)); + m128_sums1 = _mm_mul_ps( + m128_sums1, + _mm_set_ps1(static_cast(1.0 - kernel_interpolation_factor))); + m128_sums2 = _mm_mul_ps( + m128_sums2, _mm_set_ps1(static_cast(kernel_interpolation_factor))); + m128_sums1 = _mm_add_ps(m128_sums1, m128_sums2); + + // Sum components together. + float result; + m128_sums2 = _mm_add_ps(_mm_movehl_ps(m128_sums1, m128_sums1), m128_sums1); + _mm_store_ss(&result, _mm_add_ss(m128_sums2, + _mm_shuffle_ps(m128_sums2, m128_sums2, 1))); + + return result; +} + +} // namespace webrtc diff --git a/common_audio/resampler/sinc_resampler_unittest.cc b/common_audio/resampler/sinc_resampler_unittest.cc index b067b23b88..ece6af0689 100644 --- a/common_audio/resampler/sinc_resampler_unittest.cc +++ b/common_audio/resampler/sinc_resampler_unittest.cc @@ -116,17 +116,9 @@ TEST(SincResamplerTest, DISABLED_SetRatioBench) { printf("SetRatio() took %.2fms.\n", total_time_c_us / 1000); } -// Define platform independent function name for Convolve* tests. -#if defined(WEBRTC_ARCH_X86_FAMILY) -#define CONVOLVE_FUNC Convolve_SSE -#elif defined(WEBRTC_ARCH_ARM_V7) -#define CONVOLVE_FUNC Convolve_NEON -#endif - // Ensure various optimized Convolve() methods return the same value. Only run // this test if other optimized methods exist, otherwise the default Convolve() // will be tested by the parameterized SincResampler tests below. -#if defined(CONVOLVE_FUNC) TEST(SincResamplerTest, Convolve) { #if defined(WEBRTC_ARCH_X86_FAMILY) ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2)); @@ -148,7 +140,7 @@ TEST(SincResamplerTest, Convolve) { double result = resampler.Convolve_C( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); - double result2 = resampler.CONVOLVE_FUNC( + double result2 = resampler.convolve_proc_( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); EXPECT_NEAR(result2, result, kEpsilon); @@ -157,12 +149,11 @@ TEST(SincResamplerTest, Convolve) { result = resampler.Convolve_C( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); - result2 = resampler.CONVOLVE_FUNC( + result2 = resampler.convolve_proc_( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); EXPECT_NEAR(result2, result, kEpsilon); } -#endif // Benchmark for the various Convolve() methods. Make sure to build with // branding=Chrome so that RTC_DCHECKs are compiled out when benchmarking. @@ -190,7 +181,6 @@ TEST(SincResamplerTest, ConvolveBenchmark) { (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec; printf("Convolve_C took %.2fms.\n", total_time_c_us / 1000); -#if defined(CONVOLVE_FUNC) #if defined(WEBRTC_ARCH_X86_FAMILY) ASSERT_TRUE(WebRtc_GetCPUInfo(kSSE2)); #elif defined(WEBRTC_ARCH_ARM_V7) @@ -200,36 +190,33 @@ TEST(SincResamplerTest, ConvolveBenchmark) { // Benchmark with unaligned input pointer. start = rtc::TimeNanos(); for (int j = 0; j < kConvolveIterations; ++j) { - resampler.CONVOLVE_FUNC( + resampler.convolve_proc_( resampler.kernel_storage_.get() + 1, resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); } double total_time_optimized_unaligned_us = (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec; - printf(STRINGIZE(CONVOLVE_FUNC) "(unaligned) took %.2fms; which is %.2fx " + printf(STRINGIZE(convolve_proc_) "(unaligned) took %.2fms; which is %.2fx " "faster than Convolve_C.\n", total_time_optimized_unaligned_us / 1000, total_time_c_us / total_time_optimized_unaligned_us); // Benchmark with aligned input pointer. start = rtc::TimeNanos(); for (int j = 0; j < kConvolveIterations; ++j) { - resampler.CONVOLVE_FUNC( + resampler.convolve_proc_( resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), resampler.kernel_storage_.get(), kKernelInterpolationFactor); } double total_time_optimized_aligned_us = (rtc::TimeNanos() - start) / rtc::kNumNanosecsPerMicrosec; - printf(STRINGIZE(CONVOLVE_FUNC) " (aligned) took %.2fms; which is %.2fx " + printf(STRINGIZE(convolve_proc_) " (aligned) took %.2fms; which is %.2fx " "faster than Convolve_C and %.2fx faster than " - STRINGIZE(CONVOLVE_FUNC) " (unaligned).\n", + STRINGIZE(convolve_proc_) " (unaligned).\n", total_time_optimized_aligned_us / 1000, total_time_c_us / total_time_optimized_aligned_us, total_time_optimized_unaligned_us / total_time_optimized_aligned_us); -#endif } -#undef CONVOLVE_FUNC - typedef std::tuple SincResamplerTestData; class SincResamplerTest : public ::testing::TestWithParam { @@ -352,7 +339,7 @@ INSTANTIATE_TEST_SUITE_P( std::make_tuple(16000, 44100, kResamplingRMSError, -62.54), std::make_tuple(22050, 44100, kResamplingRMSError, -73.53), std::make_tuple(32000, 44100, kResamplingRMSError, -63.32), - std::make_tuple(44100, 44100, kResamplingRMSError, -73.53), + std::make_tuple(44100, 44100, kResamplingRMSError, -73.52), std::make_tuple(48000, 44100, -15.01, -64.04), std::make_tuple(96000, 44100, -18.49, -25.51), std::make_tuple(192000, 44100, -20.50, -13.31), @@ -360,7 +347,7 @@ INSTANTIATE_TEST_SUITE_P( // To 48kHz std::make_tuple(8000, 48000, kResamplingRMSError, -63.43), std::make_tuple(11025, 48000, kResamplingRMSError, -62.61), - std::make_tuple(16000, 48000, kResamplingRMSError, -63.96), + std::make_tuple(16000, 48000, kResamplingRMSError, -63.95), std::make_tuple(22050, 48000, kResamplingRMSError, -62.42), std::make_tuple(32000, 48000, kResamplingRMSError, -64.04), std::make_tuple(44100, 48000, kResamplingRMSError, -62.63), diff --git a/system_wrappers/include/cpu_features_wrapper.h b/system_wrappers/include/cpu_features_wrapper.h index 739161afca..02d54b4516 100644 --- a/system_wrappers/include/cpu_features_wrapper.h +++ b/system_wrappers/include/cpu_features_wrapper.h @@ -18,7 +18,7 @@ extern "C" { #endif // List of features in x86. -typedef enum { kSSE2, kSSE3 } CPUFeature; +typedef enum { kSSE2, kSSE3, kAVX2 } CPUFeature; // List of features in ARM. enum { diff --git a/system_wrappers/source/cpu_features.cc b/system_wrappers/source/cpu_features.cc index ebcb48c15f..1667e46c10 100644 --- a/system_wrappers/source/cpu_features.cc +++ b/system_wrappers/source/cpu_features.cc @@ -24,6 +24,20 @@ int GetCPUInfoNoASM(CPUFeature feature) { } #if defined(WEBRTC_ARCH_X86_FAMILY) + +// xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so |xcr| should always be zero. +uint64_t xgetbv(uint32_t xcr) { +#if defined(_MSC_VER) + return _xgetbv(xcr); +#else + uint32_t eax, edx; + + __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (static_cast(edx) << 32) | eax; +#endif // _MSC_VER +} + #ifndef _MSC_VER // Intrinsic for "cpuid". #if defined(__pic__) && defined(__i386__) @@ -41,7 +55,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) { __asm__ volatile("cpuid\n" : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type)); + : "a"(info_type), "c"(0)); } #endif #endif // _MSC_VER @@ -51,6 +65,8 @@ static inline void __cpuid(int cpu_info[4], int info_type) { // Actual feature detection for x86. static int GetCPUInfo(CPUFeature feature) { int cpu_info[4]; + __cpuid(cpu_info, 0); + int num_ids = cpu_info[0]; __cpuid(cpu_info, 1); if (feature == kSSE2) { return 0 != (cpu_info[3] & 0x04000000); @@ -58,6 +74,23 @@ static int GetCPUInfo(CPUFeature feature) { if (feature == kSSE3) { return 0 != (cpu_info[2] & 0x00000001); } + if (feature == kAVX2) { + // Interpret CPU feature information. + int cpu_info7[4] = {-1}; + if (num_ids >= 7) { + __cpuid(cpu_info7, 7); + } + +#if defined(WEBRTC_ENABLE_AVX2) + return (cpu_info[2] & 0x10000000) != 0 && + (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && + (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && + (xgetbv(0) & 0x00000006) == 6 /* XSAVE enabled by kernel */ && + (cpu_info7[1] & 0x00000020) != 0; +#else + return 0; +#endif // WEBRTC_ENABLE_AVX2 + } return 0; } #else diff --git a/webrtc.gni b/webrtc.gni index 100c5851a5..95154709fe 100644 --- a/webrtc.gni +++ b/webrtc.gni @@ -242,6 +242,10 @@ declare_args() { # standalone WebRTC. rtc_include_internal_audio_device = !build_with_chromium + # Set this to true to enable the avx2 support in webrtc. + # TODO(bugs.webrtc.org/11663): Default this to true and eventually remove. + rtc_enable_avx2 = false + # Include tests in standalone checkout. rtc_include_tests = !build_with_chromium && !build_with_mozilla