From 54d1344d985b00d4d1580dd18057d4618c11ad1f Mon Sep 17 00:00:00 2001 From: Alessio Bazzica Date: Sat, 12 Feb 2022 08:11:51 +0000 Subject: [PATCH] Reland "Remove unused APM voice activity detection sub-module" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit a751f167c68343f76528436defdbc61600a8d7b3. Reason for revert: dependency in a downstream project removed Original change's description: > Revert "Remove unused APM voice activity detection sub-module" > > This reverts commit b4e06d032e6f82a65c52ed0c5364ae9e7c0a0215. > > Reason for revert: breaking downstream projects > > Original change's description: > > Remove unused APM voice activity detection sub-module > > > > API changes: > > - webrtc::AudioProcessing::Config::VoiceDetection removed > > - webrtc::AudioProcessingStats::voice_detected deprecated > > - cricket::AudioOptions::typing_detection deprecated > > - webrtc::StatsReport::StatsValueName:: > > kStatsValueNameTypingNoiseState deprecated > > > > PSA: https://groups.google.com/g/discuss-webrtc/c/7X6uwmJarE0 > > > > Bug: webrtc:11226,webrtc:11292 > > Change-Id: I8d008b56708cf62961b9857ec052b59fda3b41bf > > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/250666 > > Reviewed-by: Harald Alvestrand > > Reviewed-by: Gustaf Ullberg > > Reviewed-by: Sam Zackrisson > > Reviewed-by: Björn Terelius > > Commit-Queue: Alessio Bazzica > > Cr-Commit-Position: refs/heads/main@{#35975} > > TBR=gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com > > Change-Id: Iee01fdb874b4e0331277f3ffe60dacaabc3859a2 > No-Presubmit: true > No-Tree-Checks: true > No-Try: true > Bug: webrtc:11226,webrtc:11292 > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251600 > Reviewed-by: Harald Alvestrand > Reviewed-by: Gustaf Ullberg > Commit-Queue: Mirko Bonadei > Cr-Commit-Position: refs/heads/main@{#35977} # Not skipping CQ checks because this is a reland. Bug: webrtc:11226,webrtc:11292 Change-Id: I2fcbc5fdade16bfe6a0f0a02841a33a598d4f2ad Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251660 Reviewed-by: Alessio Bazzica Reviewed-by: Harald Alvestrand Commit-Queue: Alessio Bazzica Cr-Commit-Position: refs/heads/main@{#35984} --- api/audio_options.h | 2 + api/stats_types.cc | 1 + api/stats_types.h | 1 + audio/audio_transport_impl.cc | 22 ---- audio/audio_transport_impl.h | 5 +- media/engine/webrtc_voice_engine.cc | 4 +- media/engine/webrtc_voice_engine_unittest.cc | 39 ------- modules/audio_processing/BUILD.gn | 17 --- .../audio_processing/audio_processing_impl.cc | 30 +---- .../audio_processing/audio_processing_impl.h | 5 - .../audio_processing_impl_locking_unittest.cc | 1 - .../audio_processing_performance_unittest.cc | 3 - .../audio_processing_unittest.cc | 92 +--------------- .../include/audio_processing.cc | 1 - .../include/audio_processing.h | 7 -- .../include/audio_processing_statistics.h | 2 + .../test/audio_processing_simulator.cc | 4 - .../test/audio_processing_simulator.h | 1 - .../test/audioproc_float_impl.cc | 6 - modules/audio_processing/voice_detection.cc | 92 ---------------- modules/audio_processing/voice_detection.h | 59 ---------- .../voice_detection_unittest.cc | 104 ------------------ .../audio_processing_configs_fuzzer.cc | 3 +- 23 files changed, 18 insertions(+), 483 deletions(-) delete mode 100644 modules/audio_processing/voice_detection.cc delete mode 100644 modules/audio_processing/voice_detection.h delete mode 100644 modules/audio_processing/voice_detection_unittest.cc diff --git a/api/audio_options.h b/api/audio_options.h index 48dd628ecd..16aa9e450d 100644 --- a/api/audio_options.h +++ b/api/audio_options.h @@ -60,6 +60,8 @@ struct RTC_EXPORT AudioOptions { absl::optional audio_jitter_buffer_min_delay_ms; // Audio receiver jitter buffer (NetEq) should handle retransmitted packets. absl::optional audio_jitter_buffer_enable_rtx_handling; + // Deprecated. + // TODO(bugs.webrtc.org/11226): Remove. // Audio processing to detect typing. absl::optional typing_detection; absl::optional experimental_agc; diff --git a/api/stats_types.cc b/api/stats_types.cc index 1090643f1c..b044e4ab11 100644 --- a/api/stats_types.cc +++ b/api/stats_types.cc @@ -648,6 +648,7 @@ const char* StatsReport::Value::display_name() const { return "googTrackId"; case kStatsValueNameTimingFrameInfo: return "googTimingFrameInfo"; + // TODO(bugs.webrtc.org/11226): Remove. case kStatsValueNameTypingNoiseState: return "googTypingNoiseState"; case kStatsValueNameWritable: diff --git a/api/stats_types.h b/api/stats_types.h index c3e4451ef6..e7dd528e62 100644 --- a/api/stats_types.h +++ b/api/stats_types.h @@ -235,6 +235,7 @@ class RTC_EXPORT StatsReport { kStatsValueNameTrackId, kStatsValueNameTransmitBitrate, kStatsValueNameTransportType, + // TODO(bugs.webrtc.org/11226): Remove. kStatsValueNameTypingNoiseState, kStatsValueNameWritable, kStatsValueNameAudioDeviceUnderrunCounter, diff --git a/audio/audio_transport_impl.cc b/audio/audio_transport_impl.cc index a5c952f8bc..194f09cf6c 100644 --- a/audio/audio_transport_impl.cc +++ b/audio/audio_transport_impl.cc @@ -165,24 +165,6 @@ int32_t AudioTransportImpl::RecordedDataIsAvailable( audio_frame.get()); audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns / 1000000); - // Typing detection (utilizes the APM/VAD decision). We let the VAD determine - // if we're using this feature or not. - // TODO(solenberg): GetConfig() takes a lock. Work around that. - bool typing_detected = false; - if (audio_processing_ && - audio_processing_->GetConfig().voice_detection.enabled) { - if (audio_frame->vad_activity_ != AudioFrame::kVadUnknown) { - bool vad_active = audio_frame->vad_activity_ == AudioFrame::kVadActive; - typing_detected = typing_detection_.Process(key_pressed, vad_active); - } - } - - // Copy frame and push to each sending stream. The copy is required since an - // encoding task will be posted internally to each stream. - { - MutexLock lock(&capture_lock_); - typing_noise_detected_ = typing_detected; - } RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0); if (async_audio_processing_) @@ -290,8 +272,4 @@ void AudioTransportImpl::SetStereoChannelSwapping(bool enable) { swap_stereo_channels_ = enable; } -bool AudioTransportImpl::typing_noise_detected() const { - MutexLock lock(&capture_lock_); - return typing_noise_detected_; -} } // namespace webrtc diff --git a/audio/audio_transport_impl.h b/audio/audio_transport_impl.h index 0b1406f680..89999560c6 100644 --- a/audio/audio_transport_impl.h +++ b/audio/audio_transport_impl.h @@ -86,7 +86,9 @@ class AudioTransportImpl : public AudioTransport { int send_sample_rate_hz, size_t send_num_channels); void SetStereoChannelSwapping(bool enable); - bool typing_noise_detected() const; + // Deprecated. + // TODO(bugs.webrtc.org/11226): Remove. + bool typing_noise_detected() const { return false; } private: void SendProcessedData(std::unique_ptr audio_frame); @@ -103,7 +105,6 @@ class AudioTransportImpl : public AudioTransport { std::vector audio_senders_ RTC_GUARDED_BY(capture_lock_); int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000; size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1; - bool typing_noise_detected_ RTC_GUARDED_BY(capture_lock_) = false; bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false; PushResampler capture_resampler_; TypingDetection typing_detection_; diff --git a/media/engine/webrtc_voice_engine.cc b/media/engine/webrtc_voice_engine.cc index 06400014b6..b7b0ad78d6 100644 --- a/media/engine/webrtc_voice_engine.cc +++ b/media/engine/webrtc_voice_engine.cc @@ -634,9 +634,7 @@ bool WebRtcVoiceEngine::ApplyOptions(const AudioOptions& options_in) { } if (options.typing_detection) { - RTC_LOG(LS_INFO) << "Typing detection is enabled? " - << *options.typing_detection; - apm_config.voice_detection.enabled = *options.typing_detection; + RTC_LOG(LS_WARNING) << "Typing detection is requested, but unsupported."; } ap->ApplyConfig(apm_config); diff --git a/media/engine/webrtc_voice_engine_unittest.cc b/media/engine/webrtc_voice_engine_unittest.cc index 8d864ae966..40d5714253 100644 --- a/media/engine/webrtc_voice_engine_unittest.cc +++ b/media/engine/webrtc_voice_engine_unittest.cc @@ -221,11 +221,6 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam { // Default Options. VerifyEchoCancellationSettings(/*enabled=*/true); EXPECT_TRUE(IsHighPassFilterEnabled()); -#if defined(WEBRTC_ANDROID) - EXPECT_FALSE(IsTypingDetectionEnabled()); -#else - EXPECT_TRUE(IsTypingDetectionEnabled()); -#endif EXPECT_TRUE(apm_config_.noise_suppression.enabled); EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel); VerifyGainControlEnabledCorrectly(); @@ -793,10 +788,6 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam { return apm_config_.high_pass_filter.enabled; } - bool IsTypingDetectionEnabled() { - return apm_config_.voice_detection.enabled; - } - protected: const bool use_null_apm_; std::unique_ptr task_queue_factory_; @@ -3041,40 +3032,10 @@ TEST_P(WebRtcVoiceEngineTestFake, SetAudioOptions) { if (!use_null_apm_) { VerifyEchoCancellationSettings(/*enabled=*/true); EXPECT_TRUE(IsHighPassFilterEnabled()); -#if defined(WEBRTC_ANDROID) - EXPECT_FALSE(IsTypingDetectionEnabled()); -#else - EXPECT_TRUE(IsTypingDetectionEnabled()); -#endif } EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets); EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate); - // Turn typing detection off. - send_parameters_.options.typing_detection = false; - SetSendParameters(send_parameters_); - if (!use_null_apm_) { - EXPECT_FALSE(IsTypingDetectionEnabled()); - } - - // Leave typing detection unchanged, but non-default. - send_parameters_.options.typing_detection = absl::nullopt; - SetSendParameters(send_parameters_); - if (!use_null_apm_) { - EXPECT_FALSE(IsTypingDetectionEnabled()); - } - - // Turn typing detection on. - send_parameters_.options.typing_detection = true; - SetSendParameters(send_parameters_); - if (!use_null_apm_) { -#if defined(WEBRTC_ANDROID) - EXPECT_FALSE(IsTypingDetectionEnabled()); -#else - EXPECT_TRUE(IsTypingDetectionEnabled()); -#endif - } - // Turn echo cancellation off send_parameters_.options.echo_cancellation = false; SetSendParameters(send_parameters_); diff --git a/modules/audio_processing/BUILD.gn b/modules/audio_processing/BUILD.gn index f32058d62a..ee6b579617 100644 --- a/modules/audio_processing/BUILD.gn +++ b/modules/audio_processing/BUILD.gn @@ -168,7 +168,6 @@ rtc_library("audio_processing") { ":high_pass_filter", ":optionally_built_submodule_creators", ":rms_level", - ":voice_detection", "../../api:array_view", "../../api:function_view", "../../api/audio:aec3_config", @@ -218,20 +217,6 @@ rtc_library("audio_processing") { } } -rtc_library("voice_detection") { - sources = [ - "voice_detection.cc", - "voice_detection.h", - ] - deps = [ - ":api", - ":audio_buffer", - "../../api/audio:audio_frame_api", - "../../common_audio:common_audio_c", - "../../rtc_base:checks", - ] -} - rtc_library("residual_echo_detector") { poisonous = [ "default_echo_detector" ] configs += [ ":apm_debug_dump" ] @@ -379,7 +364,6 @@ if (rtc_include_tests) { ":gain_controller2", ":high_pass_filter", ":mocks", - ":voice_detection", "../../api:array_view", "../../api:scoped_refptr", "../../api/audio:aec3_config", @@ -474,7 +458,6 @@ if (rtc_include_tests) { "test/echo_canceller_test_tools_unittest.cc", "test/echo_control_mock.h", "test/test_utils.h", - "voice_detection_unittest.cc", ] } } diff --git a/modules/audio_processing/audio_processing_impl.cc b/modules/audio_processing/audio_processing_impl.cc index 8810efeddb..9a1aaee821 100644 --- a/modules/audio_processing/audio_processing_impl.cc +++ b/modules/audio_processing/audio_processing_impl.cc @@ -141,7 +141,6 @@ bool AudioProcessingImpl::SubmoduleStates::Update( bool gain_controller2_enabled, bool gain_adjustment_enabled, bool echo_controller_enabled, - bool voice_detector_enabled, bool transient_suppressor_enabled) { bool changed = false; changed |= (high_pass_filter_enabled != high_pass_filter_enabled_); @@ -153,7 +152,6 @@ bool AudioProcessingImpl::SubmoduleStates::Update( changed |= (gain_controller2_enabled != gain_controller2_enabled_); changed |= (gain_adjustment_enabled != gain_adjustment_enabled_); changed |= (echo_controller_enabled != echo_controller_enabled_); - changed |= (voice_detector_enabled != voice_detector_enabled_); changed |= (transient_suppressor_enabled != transient_suppressor_enabled_); if (changed) { high_pass_filter_enabled_ = high_pass_filter_enabled; @@ -163,7 +161,6 @@ bool AudioProcessingImpl::SubmoduleStates::Update( gain_controller2_enabled_ = gain_controller2_enabled; gain_adjustment_enabled_ = gain_adjustment_enabled; echo_controller_enabled_ = echo_controller_enabled; - voice_detector_enabled_ = voice_detector_enabled; transient_suppressor_enabled_ = transient_suppressor_enabled; } @@ -174,7 +171,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update( bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive() const { - return CaptureMultiBandProcessingPresent() || voice_detector_enabled_; + return CaptureMultiBandProcessingPresent(); } bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent() @@ -371,7 +368,6 @@ void AudioProcessingImpl::InitializeLocked() { InitializeGainController1(); InitializeTransientSuppressor(); InitializeHighPassFilter(true); - InitializeVoiceDetector(); InitializeResidualEchoDetector(); InitializeEchoController(); InitializeGainController2(/*config_has_changed=*/true); @@ -506,9 +502,6 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) { const bool agc2_config_changed = config_.gain_controller2 != config.gain_controller2; - const bool voice_detection_config_changed = - config_.voice_detection.enabled != config.voice_detection.enabled; - const bool ns_config_changed = config_.noise_suppression.enabled != config.noise_suppression.enabled || config_.noise_suppression.level != config.noise_suppression.level; @@ -557,10 +550,6 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) { InitializeCaptureLevelsAdjuster(); } - if (voice_detection_config_changed) { - InitializeVoiceDetector(); - } - // Reinitialization must happen after all submodule configuration to avoid // additional reinitializations on the next capture / render processing call. if (pipeline_config_changed) { @@ -1215,13 +1204,6 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() { } } - if (config_.voice_detection.enabled) { - capture_.stats.voice_detected = - submodules_.voice_detector->ProcessCaptureAudio(capture_buffer); - } else { - capture_.stats.voice_detected = absl::nullopt; - } - if (submodules_.agc_manager) { submodules_.agc_manager->Process(capture_buffer); @@ -1682,7 +1664,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() { !!submodules_.gain_controller2, config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled, capture_nonlocked_.echo_controller_enabled, - config_.voice_detection.enabled, !!submodules_.transient_suppressor); + !!submodules_.transient_suppressor); } void AudioProcessingImpl::InitializeTransientSuppressor() { @@ -1732,14 +1714,6 @@ void AudioProcessingImpl::InitializeHighPassFilter(bool forced_reset) { } } -void AudioProcessingImpl::InitializeVoiceDetector() { - if (config_.voice_detection.enabled) { - submodules_.voice_detector = std::make_unique( - proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood); - } else { - submodules_.voice_detector.reset(); - } -} void AudioProcessingImpl::InitializeEchoController() { bool use_echo_controller = echo_control_factory_ || diff --git a/modules/audio_processing/audio_processing_impl.h b/modules/audio_processing/audio_processing_impl.h index 47dd62ed02..344b8c5959 100644 --- a/modules/audio_processing/audio_processing_impl.h +++ b/modules/audio_processing/audio_processing_impl.h @@ -39,7 +39,6 @@ #include "modules/audio_processing/render_queue_item_verifier.h" #include "modules/audio_processing/rms_level.h" #include "modules/audio_processing/transient/transient_suppressor.h" -#include "modules/audio_processing/voice_detection.h" #include "rtc_base/gtest_prod_util.h" #include "rtc_base/ignore_wundef.h" #include "rtc_base/swap_queue.h" @@ -208,7 +207,6 @@ class AudioProcessingImpl : public AudioProcessing { bool gain_controller2_enabled, bool gain_adjustment_enabled, bool echo_controller_enabled, - bool voice_detector_enabled, bool transient_suppressor_enabled); bool CaptureMultiBandSubModulesActive() const; bool CaptureMultiBandProcessingPresent() const; @@ -231,7 +229,6 @@ class AudioProcessingImpl : public AudioProcessing { bool gain_controller2_enabled_ = false; bool gain_adjustment_enabled_ = false; bool echo_controller_enabled_ = false; - bool voice_detector_enabled_ = false; bool transient_suppressor_enabled_ = false; bool first_update_ = true; }; @@ -267,7 +264,6 @@ class AudioProcessingImpl : public AudioProcessing { // already acquired. void InitializeHighPassFilter(bool forced_reset) RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); - void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); void InitializeTransientSuppressor() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_); @@ -400,7 +396,6 @@ class AudioProcessingImpl : public AudioProcessing { std::unique_ptr echo_control_mobile; std::unique_ptr noise_suppressor; std::unique_ptr transient_suppressor; - std::unique_ptr voice_detector; std::unique_ptr capture_levels_adjuster; } submodules_; diff --git a/modules/audio_processing/audio_processing_impl_locking_unittest.cc b/modules/audio_processing/audio_processing_impl_locking_unittest.cc index 343f077464..7557e919d6 100644 --- a/modules/audio_processing/audio_processing_impl_locking_unittest.cc +++ b/modules/audio_processing/audio_processing_impl_locking_unittest.cc @@ -483,7 +483,6 @@ AudioProcessing::Config GetApmTestConfig(AecType aec_type) { apm_config.gain_controller1.mode = AudioProcessing::Config::GainController1::kAdaptiveDigital; apm_config.noise_suppression.enabled = true; - apm_config.voice_detection.enabled = true; return apm_config; } diff --git a/modules/audio_processing/audio_processing_performance_unittest.cc b/modules/audio_processing/audio_processing_performance_unittest.cc index c885293a4f..57655aea6d 100644 --- a/modules/audio_processing/audio_processing_performance_unittest.cc +++ b/modules/audio_processing/audio_processing_performance_unittest.cc @@ -441,7 +441,6 @@ class CallSimulator : public ::testing::TestWithParam { apm_config.gain_controller1.enabled = true; apm_config.gain_controller1.mode = AudioProcessing::Config::GainController1::kAdaptiveDigital; - apm_config.voice_detection.enabled = true; apm->ApplyConfig(apm_config); }; @@ -453,7 +452,6 @@ class CallSimulator : public ::testing::TestWithParam { apm_config.noise_suppression.enabled = true; apm_config.gain_controller1.mode = AudioProcessing::Config::GainController1::kAdaptiveDigital; - apm_config.voice_detection.enabled = true; apm->ApplyConfig(apm_config); }; @@ -464,7 +462,6 @@ class CallSimulator : public ::testing::TestWithParam { apm_config.echo_canceller.enabled = false; apm_config.gain_controller1.enabled = false; apm_config.noise_suppression.enabled = false; - apm_config.voice_detection.enabled = false; apm->ApplyConfig(apm_config); }; diff --git a/modules/audio_processing/audio_processing_unittest.cc b/modules/audio_processing/audio_processing_unittest.cc index 96e2d846d9..b21a0227c5 100644 --- a/modules/audio_processing/audio_processing_unittest.cc +++ b/modules/audio_processing/audio_processing_unittest.cc @@ -190,7 +190,6 @@ void EnableAllAPComponents(AudioProcessing* ap) { apm_config.noise_suppression.enabled = true; apm_config.high_pass_filter.enabled = true; - apm_config.voice_detection.enabled = true; apm_config.pipeline.maximum_internal_processing_rate = 48000; ap->ApplyConfig(apm_config); } @@ -1226,7 +1225,6 @@ TEST_F(ApmTest, AllProcessingDisabledByDefault) { EXPECT_FALSE(config.high_pass_filter.enabled); EXPECT_FALSE(config.gain_controller1.enabled); EXPECT_FALSE(config.noise_suppression.enabled); - EXPECT_FALSE(config.voice_detection.enabled); } TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) { @@ -1367,48 +1365,6 @@ TEST_F(ApmTest, SplittingFilter) { EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy)); apm_->ApplyConfig(apm_config); - // 3. Only GetStatistics-reporting VAD is enabled... - SetFrameTo(&frame_, 1000); - frame_copy.CopyFrom(frame_); - apm_config.voice_detection.enabled = true; - apm_->ApplyConfig(apm_config); - EXPECT_EQ(apm_->kNoError, - apm_->ProcessStream( - frame_.data.data(), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - frame_.data.data())); - EXPECT_EQ(apm_->kNoError, - apm_->ProcessStream( - frame_.data.data(), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - frame_.data.data())); - EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy)); - apm_config.voice_detection.enabled = false; - apm_->ApplyConfig(apm_config); - - // 4. The VAD is enabled... - SetFrameTo(&frame_, 1000); - frame_copy.CopyFrom(frame_); - apm_config.voice_detection.enabled = true; - apm_->ApplyConfig(apm_config); - EXPECT_EQ(apm_->kNoError, - apm_->ProcessStream( - frame_.data.data(), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - frame_.data.data())); - EXPECT_EQ(apm_->kNoError, - apm_->ProcessStream( - frame_.data.data(), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - StreamConfig(frame_.sample_rate_hz, frame_.num_channels), - frame_.data.data())); - EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy)); - apm_config.voice_detection.enabled = false; - apm_->ApplyConfig(apm_config); - // Check the test is valid. We should have distortion from the filter // when AEC is enabled (which won't affect the audio). apm_config.echo_canceller.enabled = true; @@ -1736,7 +1692,6 @@ TEST_F(ApmTest, Process) { static_cast(test->num_reverse_channels()), true); int frame_count = 0; - int has_voice_count = 0; int analog_level = 127; int analog_level_average = 0; int max_output_average = 0; @@ -1772,8 +1727,6 @@ TEST_F(ApmTest, Process) { analog_level = apm_->recommended_stream_analog_level(); analog_level_average += analog_level; AudioProcessingStats stats = apm_->GetStatistics(); - EXPECT_TRUE(stats.voice_detected); - has_voice_count += *stats.voice_detected ? 1 : 0; size_t frame_size = frame_.samples_per_channel * frame_.num_channels; size_t write_count = @@ -1829,33 +1782,23 @@ TEST_F(ApmTest, Process) { if (!absl::GetFlag(FLAGS_write_apm_ref_data)) { const int kIntNear = 1; - // When running the test on a N7 we get a {2, 6} difference of - // `has_voice_count` and `max_output_average` is up to 18 higher. - // All numbers being consistently higher on N7 compare to ref_data. + // All numbers being consistently higher on N7 compare to the reference + // data. // TODO(bjornv): If we start getting more of these offsets on Android we // should consider a different approach. Either using one slack for all, // or generate a separate android reference. #if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS) - const int kHasVoiceCountOffset = 3; - const int kHasVoiceCountNear = 8; const int kMaxOutputAverageOffset = 9; const int kMaxOutputAverageNear = 26; #else - const int kHasVoiceCountOffset = 0; - const int kHasVoiceCountNear = kIntNear; const int kMaxOutputAverageOffset = 0; const int kMaxOutputAverageNear = kIntNear; #endif - EXPECT_NEAR(test->has_voice_count(), - has_voice_count - kHasVoiceCountOffset, kHasVoiceCountNear); - EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear); EXPECT_NEAR(test->max_output_average(), max_output_average - kMaxOutputAverageOffset, kMaxOutputAverageNear); } else { - test->set_has_voice_count(has_voice_count); - test->set_analog_level_average(analog_level_average); test->set_max_output_average(max_output_average); } @@ -2685,7 +2628,6 @@ rtc::scoped_refptr CreateApm(bool mobile_aec) { apm_config.echo_canceller.enabled = true; apm_config.echo_canceller.mobile_mode = mobile_aec; apm_config.noise_suppression.enabled = false; - apm_config.voice_detection.enabled = false; apm->ApplyConfig(apm_config); return apm; } @@ -2794,10 +2736,9 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) { EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value()); } -TEST(ApmStatistics, ReportHasVoice) { +TEST(ApmStatistics, DoNotReportVoiceDetectedStat) { ProcessingConfig processing_config = { {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}}; - AudioProcessing::Config config; // Set up an audioframe. Int16FrameData frame; @@ -2814,37 +2755,14 @@ TEST(ApmStatistics, ReportHasVoice) { AudioProcessingBuilderForTesting().Create(); apm->Initialize(processing_config); - // If not enabled, no metric should be reported. + // No metric should be reported. EXPECT_EQ( apm->ProcessStream(frame.data.data(), StreamConfig(frame.sample_rate_hz, frame.num_channels), StreamConfig(frame.sample_rate_hz, frame.num_channels), frame.data.data()), 0); - EXPECT_FALSE(apm->GetStatistics().voice_detected); - - // If enabled, metrics should be reported. - config.voice_detection.enabled = true; - apm->ApplyConfig(config); - EXPECT_EQ( - apm->ProcessStream(frame.data.data(), - StreamConfig(frame.sample_rate_hz, frame.num_channels), - StreamConfig(frame.sample_rate_hz, frame.num_channels), - frame.data.data()), - 0); - auto stats = apm->GetStatistics(); - EXPECT_TRUE(stats.voice_detected); - - // If re-disabled, the value is again not reported. - config.voice_detection.enabled = false; - apm->ApplyConfig(config); - EXPECT_EQ( - apm->ProcessStream(frame.data.data(), - StreamConfig(frame.sample_rate_hz, frame.num_channels), - StreamConfig(frame.sample_rate_hz, frame.num_channels), - frame.data.data()), - 0); - EXPECT_FALSE(apm->GetStatistics().voice_detected); + EXPECT_FALSE(apm->GetStatistics().voice_detected.has_value()); } TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) { diff --git a/modules/audio_processing/include/audio_processing.cc b/modules/audio_processing/include/audio_processing.cc index 9643b6ca0b..86edaee087 100644 --- a/modules/audio_processing/include/audio_processing.cc +++ b/modules/audio_processing/include/audio_processing.cc @@ -145,7 +145,6 @@ std::string AudioProcessing::Config::ToString() const { << NoiseSuppressionLevelToString(noise_suppression.level) << " }, transient_suppression: { enabled: " << transient_suppression.enabled - << " }, voice_detection: { enabled: " << voice_detection.enabled << " }, gain_controller1: { enabled: " << gain_controller1.enabled << ", mode: " << GainController1ModeToString(gain_controller1.mode) << ", target_level_dbfs: " << gain_controller1.target_level_dbfs diff --git a/modules/audio_processing/include/audio_processing.h b/modules/audio_processing/include/audio_processing.h index 8af5013e94..9d6824c038 100644 --- a/modules/audio_processing/include/audio_processing.h +++ b/modules/audio_processing/include/audio_processing.h @@ -113,8 +113,6 @@ static constexpr int kClippedLevelMin = 70; // // config.high_pass_filter.enabled = true; // -// config.voice_detection.enabled = true; -// // apm->ApplyConfig(config) // // apm->noise_reduction()->set_level(kHighSuppression); @@ -232,11 +230,6 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface { bool enabled = false; } transient_suppression; - // Enables reporting of `voice_detected` in webrtc::AudioProcessingStats. - struct VoiceDetection { - bool enabled = false; - } voice_detection; - // Enables automatic gain control (AGC) functionality. // The automatic gain control (AGC) component brings the signal to an // appropriate range. This is done by applying a digital gain directly and, diff --git a/modules/audio_processing/include/audio_processing_statistics.h b/modules/audio_processing/include/audio_processing_statistics.h index a31dafe49c..3b43319951 100644 --- a/modules/audio_processing/include/audio_processing_statistics.h +++ b/modules/audio_processing/include/audio_processing_statistics.h @@ -24,6 +24,8 @@ struct RTC_EXPORT AudioProcessingStats { AudioProcessingStats(const AudioProcessingStats& other); ~AudioProcessingStats(); + // Deprecated. + // TODO(bugs.webrtc.org/11226): Remove. // True if voice is detected in the last capture frame, after processing. // It is conservative in flagging audio as speech, with low likelihood of // incorrectly flagging a frame as voice. diff --git a/modules/audio_processing/test/audio_processing_simulator.cc b/modules/audio_processing/test/audio_processing_simulator.cc index b1edda18d6..4915648fa9 100644 --- a/modules/audio_processing/test/audio_processing_simulator.cc +++ b/modules/audio_processing/test/audio_processing_simulator.cc @@ -543,10 +543,6 @@ void AudioProcessingSimulator::ConfigureAudioProcessor() { apm_config.high_pass_filter.enabled = *settings_.use_hpf; } - if (settings_.use_vad) { - apm_config.voice_detection.enabled = *settings_.use_vad; - } - if (settings_.use_agc) { apm_config.gain_controller1.enabled = *settings_.use_agc; } diff --git a/modules/audio_processing/test/audio_processing_simulator.h b/modules/audio_processing/test/audio_processing_simulator.h index ae3cd4fbe5..af76d7e1c9 100644 --- a/modules/audio_processing/test/audio_processing_simulator.h +++ b/modules/audio_processing/test/audio_processing_simulator.h @@ -105,7 +105,6 @@ struct SimulationSettings { absl::optional use_ns; absl::optional use_ts; absl::optional use_analog_agc; - absl::optional use_vad; absl::optional use_all; absl::optional analog_agc_disable_digital_adaptive; absl::optional agc_mode; diff --git a/modules/audio_processing/test/audioproc_float_impl.cc b/modules/audio_processing/test/audioproc_float_impl.cc index d4697e4493..aab1881913 100644 --- a/modules/audio_processing/test/audioproc_float_impl.cc +++ b/modules/audio_processing/test/audioproc_float_impl.cc @@ -117,10 +117,6 @@ ABSL_FLAG(int, analog_agc, kParameterNotSpecifiedValue, "Activate (1) or deactivate (0) the analog AGC"); -ABSL_FLAG(int, - vad, - kParameterNotSpecifiedValue, - "Activate (1) or deactivate (0) the voice activity detector"); ABSL_FLAG(bool, all_default, false, @@ -365,7 +361,6 @@ void SetSettingIfFlagSet(int32_t flag, absl::optional* parameter) { SimulationSettings CreateSettings() { SimulationSettings settings; if (absl::GetFlag(FLAGS_all_default)) { - settings.use_vad = true; settings.use_ts = true; settings.use_analog_agc = true; settings.use_ns = true; @@ -417,7 +412,6 @@ SimulationSettings CreateSettings() { SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts); SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc), &settings.use_analog_agc); - SetSettingIfFlagSet(absl::GetFlag(FLAGS_vad), &settings.use_vad); SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive), &settings.analog_agc_disable_digital_adaptive); SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode); diff --git a/modules/audio_processing/voice_detection.cc b/modules/audio_processing/voice_detection.cc deleted file mode 100644 index 1a633e2286..0000000000 --- a/modules/audio_processing/voice_detection.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "modules/audio_processing/voice_detection.h" - -#include "common_audio/vad/include/webrtc_vad.h" -#include "modules/audio_processing/audio_buffer.h" -#include "rtc_base/checks.h" - -namespace webrtc { -class VoiceDetection::Vad { - public: - Vad() { - state_ = WebRtcVad_Create(); - RTC_CHECK(state_); - int error = WebRtcVad_Init(state_); - RTC_DCHECK_EQ(0, error); - } - ~Vad() { WebRtcVad_Free(state_); } - - Vad(Vad&) = delete; - Vad& operator=(Vad&) = delete; - - VadInst* state() { return state_; } - - private: - VadInst* state_ = nullptr; -}; - -VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood) - : sample_rate_hz_(sample_rate_hz), - frame_size_samples_(static_cast(sample_rate_hz_ / 100)), - likelihood_(likelihood), - vad_(new Vad()) { - int mode = 2; - switch (likelihood) { - case VoiceDetection::kVeryLowLikelihood: - mode = 3; - break; - case VoiceDetection::kLowLikelihood: - mode = 2; - break; - case VoiceDetection::kModerateLikelihood: - mode = 1; - break; - case VoiceDetection::kHighLikelihood: - mode = 0; - break; - default: - RTC_DCHECK_NOTREACHED(); - break; - } - int error = WebRtcVad_set_mode(vad_->state(), mode); - RTC_DCHECK_EQ(0, error); -} - -VoiceDetection::~VoiceDetection() {} - -bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) { - RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength, - audio->num_frames_per_band()); - std::array mixed_low_pass_data; - rtc::ArrayView mixed_low_pass(mixed_low_pass_data.data(), - audio->num_frames_per_band()); - if (audio->num_channels() == 1) { - FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz], - audio->num_frames_per_band(), mixed_low_pass_data.data()); - } else { - const int num_channels = static_cast(audio->num_channels()); - for (size_t i = 0; i < audio->num_frames_per_band(); ++i) { - int32_t value = - FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]); - for (int j = 1; j < num_channels; ++j) { - value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]); - } - mixed_low_pass_data[i] = value / num_channels; - } - } - - int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_, - mixed_low_pass.data(), frame_size_samples_); - RTC_DCHECK(vad_ret == 0 || vad_ret == 1); - return vad_ret == 0 ? false : true; -} -} // namespace webrtc diff --git a/modules/audio_processing/voice_detection.h b/modules/audio_processing/voice_detection.h deleted file mode 100644 index 79d44e647c..0000000000 --- a/modules/audio_processing/voice_detection.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2019 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_ -#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_ - -#include - -#include - -#include "modules/audio_processing/include/audio_processing.h" - -namespace webrtc { - -class AudioBuffer; - -// The voice activity detection (VAD) component analyzes the stream to -// determine if voice is present. -class VoiceDetection { - public: - // Specifies the likelihood that a frame will be declared to contain voice. - // A higher value makes it more likely that speech will not be clipped, at - // the expense of more noise being detected as voice. - enum Likelihood { - kVeryLowLikelihood, - kLowLikelihood, - kModerateLikelihood, - kHighLikelihood - }; - - VoiceDetection(int sample_rate_hz, Likelihood likelihood); - ~VoiceDetection(); - - VoiceDetection(VoiceDetection&) = delete; - VoiceDetection& operator=(VoiceDetection&) = delete; - - // Returns true if voice is detected in the current frame. - bool ProcessCaptureAudio(AudioBuffer* audio); - - Likelihood likelihood() const { return likelihood_; } - - private: - class Vad; - - int sample_rate_hz_; - size_t frame_size_samples_; - Likelihood likelihood_; - std::unique_ptr vad_; -}; -} // namespace webrtc - -#endif // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_ diff --git a/modules/audio_processing/voice_detection_unittest.cc b/modules/audio_processing/voice_detection_unittest.cc deleted file mode 100644 index e1117e495d..0000000000 --- a/modules/audio_processing/voice_detection_unittest.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (c) 2016 The WebRTC project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include - -#include "api/array_view.h" -#include "modules/audio_processing/audio_buffer.h" -#include "modules/audio_processing/test/audio_buffer_tools.h" -#include "modules/audio_processing/test/bitexactness_tools.h" -#include "modules/audio_processing/voice_detection.h" -#include "test/gtest.h" - -namespace webrtc { -namespace { - -const int kNumFramesToProcess = 1000; - -// Process one frame of data and produce the output. -bool ProcessOneFrame(int sample_rate_hz, - AudioBuffer* audio_buffer, - VoiceDetection* voice_detection) { - if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) { - audio_buffer->SplitIntoFrequencyBands(); - } - - return voice_detection->ProcessCaptureAudio(audio_buffer); -} - -// Processes a specified amount of frames, verifies the results and reports -// any errors. -void RunBitexactnessTest(int sample_rate_hz, - size_t num_channels, - bool stream_has_voice_reference) { - int sample_rate_to_use = std::min(sample_rate_hz, 16000); - VoiceDetection voice_detection(sample_rate_to_use, - VoiceDetection::kLowLikelihood); - - int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100); - const StreamConfig capture_config(sample_rate_hz, num_channels); - AudioBuffer capture_buffer( - capture_config.sample_rate_hz(), capture_config.num_channels(), - capture_config.sample_rate_hz(), capture_config.num_channels(), - capture_config.sample_rate_hz(), capture_config.num_channels()); - test::InputAudioFile capture_file( - test::GetApmCaptureTestVectorFileName(sample_rate_hz)); - std::vector capture_input(samples_per_channel * num_channels); - bool stream_has_voice = false; - for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) { - ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels, - &capture_file, capture_input); - - test::CopyVectorToAudioBuffer(capture_config, capture_input, - &capture_buffer); - - stream_has_voice = - ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection); - } - - EXPECT_EQ(stream_has_voice_reference, stream_has_voice); -} - -const bool kStreamHasVoiceReference = true; - -} // namespace - -TEST(VoiceDetectionBitExactnessTest, Mono8kHz) { - RunBitexactnessTest(8000, 1, kStreamHasVoiceReference); -} - -TEST(VoiceDetectionBitExactnessTest, Mono16kHz) { - RunBitexactnessTest(16000, 1, kStreamHasVoiceReference); -} - -TEST(VoiceDetectionBitExactnessTest, Mono32kHz) { - RunBitexactnessTest(32000, 1, kStreamHasVoiceReference); -} - -TEST(VoiceDetectionBitExactnessTest, Mono48kHz) { - RunBitexactnessTest(48000, 1, kStreamHasVoiceReference); -} - -TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) { - RunBitexactnessTest(8000, 2, kStreamHasVoiceReference); -} - -TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) { - RunBitexactnessTest(16000, 2, kStreamHasVoiceReference); -} - -TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) { - RunBitexactnessTest(32000, 2, kStreamHasVoiceReference); -} - -TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) { - RunBitexactnessTest(48000, 2, kStreamHasVoiceReference); -} - -} // namespace webrtc diff --git a/test/fuzzers/audio_processing_configs_fuzzer.cc b/test/fuzzers/audio_processing_configs_fuzzer.cc index 54a43dfe2d..f04ef773ac 100644 --- a/test/fuzzers/audio_processing_configs_fuzzer.cc +++ b/test/fuzzers/audio_processing_configs_fuzzer.cc @@ -54,7 +54,7 @@ rtc::scoped_refptr CreateApm(test::FuzzDataHelper* fuzz_data, bool use_agc = fuzz_data->ReadOrDefaultValue(true); bool use_ns = fuzz_data->ReadOrDefaultValue(true); static_cast(fuzz_data->ReadOrDefaultValue(true)); - bool use_vad = fuzz_data->ReadOrDefaultValue(true); + static_cast(fuzz_data->ReadOrDefaultValue(true)); bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true); bool use_agc2 = fuzz_data->ReadOrDefaultValue(true); @@ -114,7 +114,6 @@ rtc::scoped_refptr CreateApm(test::FuzzDataHelper* fuzz_data, use_agc2_adaptive_digital; apm_config.noise_suppression.enabled = use_ns; apm_config.transient_suppression.enabled = use_ts; - apm_config.voice_detection.enabled = use_vad; rtc::scoped_refptr apm = AudioProcessingBuilderForTesting()