Reland "Remove unused APM voice activity detection sub-module"

This reverts commit a751f167c6. Reason for revert: dependency in a downstream project removed Original change's description: > Revert "Remove unused APM voice activity detection sub-module" > > This reverts commit b4e06d032e. > > Reason for revert: breaking downstream projects > > Original change's description: > > Remove unused APM voice activity detection sub-module > > > > API changes: > > - webrtc::AudioProcessing::Config::VoiceDetection removed > > - webrtc::AudioProcessingStats::voice_detected deprecated > > - cricket::AudioOptions::typing_detection deprecated > > - webrtc::StatsReport::StatsValueName:: > > kStatsValueNameTypingNoiseState deprecated > > > > PSA: https://groups.google.com/g/discuss-webrtc/c/7X6uwmJarE0 > > > > Bug: webrtc:11226,webrtc:11292 > > Change-Id: I8d008b56708cf62961b9857ec052b59fda3b41bf > > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/250666 > > Reviewed-by: Harald Alvestrand <hta@webrtc.org> > > Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org> > > Reviewed-by: Sam Zackrisson <saza@webrtc.org> > > Reviewed-by: Björn Terelius <terelius@webrtc.org> > > Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> > > Cr-Commit-Position: refs/heads/main@{#35975} > > TBR=gustaf@webrtc.org,saza@webrtc.org,alessiob@webrtc.org,terelius@webrtc.org,hta@webrtc.org,webrtc-scoped@luci-project-accounts.iam.gserviceaccount.com > > Change-Id: Iee01fdb874b4e0331277f3ffe60dacaabc3859a2 > No-Presubmit: true > No-Tree-Checks: true > No-Try: true > Bug: webrtc:11226,webrtc:11292 > Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251600 > Reviewed-by: Harald Alvestrand <hta@webrtc.org> > Reviewed-by: Gustaf Ullberg <gustaf@webrtc.org> > Commit-Queue: Mirko Bonadei <mbonadei@webrtc.org> > Cr-Commit-Position: refs/heads/main@{#35977} # Not skipping CQ checks because this is a reland. 
Bug: webrtc:11226,webrtc:11292 Change-Id: I2fcbc5fdade16bfe6a0f0a02841a33a598d4f2ad Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/251660 Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Reviewed-by: Harald Alvestrand <hta@webrtc.org> Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Cr-Commit-Position: refs/heads/main@{#35984}
2025-05-12 21:30:45 +01:00 · 2022-02-12 08:11:51 +00:00 · 2022-02-12 08:11:51 +00:00 · 54d1344d98
commit 54d1344d98
parent f342d6054a
23 changed files with 18 additions and 483 deletions
--- a/api/audio_options.h
+++ b/api/audio_options.h
@ -60,6 +60,8 @@ struct RTC_EXPORT AudioOptions {
  absl::optional<int> audio_jitter_buffer_min_delay_ms;
  // Audio receiver jitter buffer (NetEq) should handle retransmitted packets.
  absl::optional<bool> audio_jitter_buffer_enable_rtx_handling;
+  // Deprecated.
+  // TODO(bugs.webrtc.org/11226): Remove.
  // Audio processing to detect typing.
  absl::optional<bool> typing_detection;
  absl::optional<bool> experimental_agc;
--- a/api/stats_types.cc
+++ b/api/stats_types.cc
@ -648,6 +648,7 @@ const char* StatsReport::Value::display_name() const {
      return "googTrackId";
    case kStatsValueNameTimingFrameInfo:
      return "googTimingFrameInfo";
+    // TODO(bugs.webrtc.org/11226): Remove.
    case kStatsValueNameTypingNoiseState:
      return "googTypingNoiseState";
    case kStatsValueNameWritable:
--- a/api/stats_types.h
+++ b/api/stats_types.h
@ -235,6 +235,7 @@ class RTC_EXPORT StatsReport {
    kStatsValueNameTrackId,
    kStatsValueNameTransmitBitrate,
    kStatsValueNameTransportType,
+    // TODO(bugs.webrtc.org/11226): Remove.
    kStatsValueNameTypingNoiseState,
    kStatsValueNameWritable,
    kStatsValueNameAudioDeviceUnderrunCounter,
--- a/audio/audio_transport_impl.cc
+++ b/audio/audio_transport_impl.cc
@ -165,24 +165,6 @@ int32_t AudioTransportImpl::RecordedDataIsAvailable(
                      audio_frame.get());
  audio_frame->set_absolute_capture_timestamp_ms(estimated_capture_time_ns /
                                                 1000000);
-  // Typing detection (utilizes the APM/VAD decision). We let the VAD determine
-  // if we're using this feature or not.
-  // TODO(solenberg): GetConfig() takes a lock. Work around that.
-  bool typing_detected = false;
-  if (audio_processing_ &&
-      audio_processing_->GetConfig().voice_detection.enabled) {
-    if (audio_frame->vad_activity_ != AudioFrame::kVadUnknown) {
-      bool vad_active = audio_frame->vad_activity_ == AudioFrame::kVadActive;
-      typing_detected = typing_detection_.Process(key_pressed, vad_active);
-    }
-  }
-
-  // Copy frame and push to each sending stream. The copy is required since an
-  // encoding task will be posted internally to each stream.
-  {
-    MutexLock lock(&capture_lock_);
-    typing_noise_detected_ = typing_detected;
-  }

  RTC_DCHECK_GT(audio_frame->samples_per_channel_, 0);
  if (async_audio_processing_)
@ -290,8 +272,4 @@ void AudioTransportImpl::SetStereoChannelSwapping(bool enable) {
  swap_stereo_channels_ = enable;
 }

-bool AudioTransportImpl::typing_noise_detected() const {
-  MutexLock lock(&capture_lock_);
-  return typing_noise_detected_;
-}
 }  // namespace webrtc
--- a/audio/audio_transport_impl.h
+++ b/audio/audio_transport_impl.h
@ -86,7 +86,9 @@ class AudioTransportImpl : public AudioTransport {
                          int send_sample_rate_hz,
                          size_t send_num_channels);
  void SetStereoChannelSwapping(bool enable);
-  bool typing_noise_detected() const;
+  // Deprecated.
+  // TODO(bugs.webrtc.org/11226): Remove.
+  bool typing_noise_detected() const { return false; }

 private:
  void SendProcessedData(std::unique_ptr<AudioFrame> audio_frame);
@ -103,7 +105,6 @@ class AudioTransportImpl : public AudioTransport {
  std::vector<AudioSender*> audio_senders_ RTC_GUARDED_BY(capture_lock_);
  int send_sample_rate_hz_ RTC_GUARDED_BY(capture_lock_) = 8000;
  size_t send_num_channels_ RTC_GUARDED_BY(capture_lock_) = 1;
-  bool typing_noise_detected_ RTC_GUARDED_BY(capture_lock_) = false;
  bool swap_stereo_channels_ RTC_GUARDED_BY(capture_lock_) = false;
  PushResampler<int16_t> capture_resampler_;
  TypingDetection typing_detection_;
--- a/media/engine/webrtc_voice_engine.cc
+++ b/media/engine/webrtc_voice_engine.cc
@ -634,9 +634,7 @@ bool WebRtcVoiceEngine::ApplyOptions(const AudioOptions& options_in) {
  }

  if (options.typing_detection) {
-    RTC_LOG(LS_INFO) << "Typing detection is enabled? "
-                     << *options.typing_detection;
-    apm_config.voice_detection.enabled = *options.typing_detection;
+    RTC_LOG(LS_WARNING) << "Typing detection is requested, but unsupported.";
  }

  ap->ApplyConfig(apm_config);
--- a/media/engine/webrtc_voice_engine_unittest.cc
+++ b/media/engine/webrtc_voice_engine_unittest.cc
@ -221,11 +221,6 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam<bool> {
      // Default Options.
      VerifyEchoCancellationSettings(/*enabled=*/true);
      EXPECT_TRUE(IsHighPassFilterEnabled());
-#if defined(WEBRTC_ANDROID)
-      EXPECT_FALSE(IsTypingDetectionEnabled());
-#else
-      EXPECT_TRUE(IsTypingDetectionEnabled());
-#endif
      EXPECT_TRUE(apm_config_.noise_suppression.enabled);
      EXPECT_EQ(apm_config_.noise_suppression.level, kDefaultNsLevel);
      VerifyGainControlEnabledCorrectly();
@ -793,10 +788,6 @@ class WebRtcVoiceEngineTestFake : public ::testing::TestWithParam<bool> {
    return apm_config_.high_pass_filter.enabled;
  }

-  bool IsTypingDetectionEnabled() {
-    return apm_config_.voice_detection.enabled;
-  }
-
 protected:
  const bool use_null_apm_;
  std::unique_ptr<webrtc::TaskQueueFactory> task_queue_factory_;
@ -3041,40 +3032,10 @@ TEST_P(WebRtcVoiceEngineTestFake, SetAudioOptions) {
  if (!use_null_apm_) {
    VerifyEchoCancellationSettings(/*enabled=*/true);
    EXPECT_TRUE(IsHighPassFilterEnabled());
-#if defined(WEBRTC_ANDROID)
-    EXPECT_FALSE(IsTypingDetectionEnabled());
-#else
-    EXPECT_TRUE(IsTypingDetectionEnabled());
-#endif
  }
  EXPECT_EQ(200u, GetRecvStreamConfig(kSsrcY).jitter_buffer_max_packets);
  EXPECT_FALSE(GetRecvStreamConfig(kSsrcY).jitter_buffer_fast_accelerate);

-  // Turn typing detection off.
-  send_parameters_.options.typing_detection = false;
-  SetSendParameters(send_parameters_);
-  if (!use_null_apm_) {
-    EXPECT_FALSE(IsTypingDetectionEnabled());
-  }
-
-  // Leave typing detection unchanged, but non-default.
-  send_parameters_.options.typing_detection = absl::nullopt;
-  SetSendParameters(send_parameters_);
-  if (!use_null_apm_) {
-    EXPECT_FALSE(IsTypingDetectionEnabled());
-  }
-
-  // Turn typing detection on.
-  send_parameters_.options.typing_detection = true;
-  SetSendParameters(send_parameters_);
-  if (!use_null_apm_) {
-#if defined(WEBRTC_ANDROID)
-    EXPECT_FALSE(IsTypingDetectionEnabled());
-#else
-    EXPECT_TRUE(IsTypingDetectionEnabled());
-#endif
-  }
-
  // Turn echo cancellation off
  send_parameters_.options.echo_cancellation = false;
  SetSendParameters(send_parameters_);
--- a/modules/audio_processing/BUILD.gn
+++ b/modules/audio_processing/BUILD.gn
@ -168,7 +168,6 @@ rtc_library("audio_processing") {
    ":high_pass_filter",
    ":optionally_built_submodule_creators",
    ":rms_level",
-    ":voice_detection",
    "../../api:array_view",
    "../../api:function_view",
    "../../api/audio:aec3_config",
@ -218,20 +217,6 @@ rtc_library("audio_processing") {
  }
 }

-rtc_library("voice_detection") {
-  sources = [
-    "voice_detection.cc",
-    "voice_detection.h",
-  ]
-  deps = [
-    ":api",
-    ":audio_buffer",
-    "../../api/audio:audio_frame_api",
-    "../../common_audio:common_audio_c",
-    "../../rtc_base:checks",
-  ]
-}
-
 rtc_library("residual_echo_detector") {
  poisonous = [ "default_echo_detector" ]
  configs += [ ":apm_debug_dump" ]
@ -379,7 +364,6 @@ if (rtc_include_tests) {
        ":gain_controller2",
        ":high_pass_filter",
        ":mocks",
-        ":voice_detection",
        "../../api:array_view",
        "../../api:scoped_refptr",
        "../../api/audio:aec3_config",
@ -474,7 +458,6 @@ if (rtc_include_tests) {
          "test/echo_canceller_test_tools_unittest.cc",
          "test/echo_control_mock.h",
          "test/test_utils.h",
-          "voice_detection_unittest.cc",
        ]
      }
    }
--- a/modules/audio_processing/audio_processing_impl.cc
+++ b/modules/audio_processing/audio_processing_impl.cc
@ -141,7 +141,6 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
    bool gain_controller2_enabled,
    bool gain_adjustment_enabled,
    bool echo_controller_enabled,
-    bool voice_detector_enabled,
    bool transient_suppressor_enabled) {
  bool changed = false;
  changed |= (high_pass_filter_enabled != high_pass_filter_enabled_);
@ -153,7 +152,6 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
  changed |= (gain_controller2_enabled != gain_controller2_enabled_);
  changed |= (gain_adjustment_enabled != gain_adjustment_enabled_);
  changed |= (echo_controller_enabled != echo_controller_enabled_);
-  changed |= (voice_detector_enabled != voice_detector_enabled_);
  changed |= (transient_suppressor_enabled != transient_suppressor_enabled_);
  if (changed) {
    high_pass_filter_enabled_ = high_pass_filter_enabled;
@ -163,7 +161,6 @@ bool AudioProcessingImpl::SubmoduleStates::Update(
    gain_controller2_enabled_ = gain_controller2_enabled;
    gain_adjustment_enabled_ = gain_adjustment_enabled;
    echo_controller_enabled_ = echo_controller_enabled;
-    voice_detector_enabled_ = voice_detector_enabled;
    transient_suppressor_enabled_ = transient_suppressor_enabled;
  }

@ -174,7 +171,7 @@ bool AudioProcessingImpl::SubmoduleStates::Update(

 bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandSubModulesActive()
    const {
-  return CaptureMultiBandProcessingPresent() || voice_detector_enabled_;
+  return CaptureMultiBandProcessingPresent();
 }

 bool AudioProcessingImpl::SubmoduleStates::CaptureMultiBandProcessingPresent()
@ -371,7 +368,6 @@ void AudioProcessingImpl::InitializeLocked() {
  InitializeGainController1();
  InitializeTransientSuppressor();
  InitializeHighPassFilter(true);
-  InitializeVoiceDetector();
  InitializeResidualEchoDetector();
  InitializeEchoController();
  InitializeGainController2(/*config_has_changed=*/true);
@ -506,9 +502,6 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
  const bool agc2_config_changed =
      config_.gain_controller2 != config.gain_controller2;

-  const bool voice_detection_config_changed =
-      config_.voice_detection.enabled != config.voice_detection.enabled;
-
  const bool ns_config_changed =
      config_.noise_suppression.enabled != config.noise_suppression.enabled ||
      config_.noise_suppression.level != config.noise_suppression.level;
@ -557,10 +550,6 @@ void AudioProcessingImpl::ApplyConfig(const AudioProcessing::Config& config) {
    InitializeCaptureLevelsAdjuster();
  }

-  if (voice_detection_config_changed) {
-    InitializeVoiceDetector();
-  }
-
  // Reinitialization must happen after all submodule configuration to avoid
  // additional reinitializations on the next capture / render processing call.
  if (pipeline_config_changed) {
@ -1215,13 +1204,6 @@ int AudioProcessingImpl::ProcessCaptureStreamLocked() {
    }
  }

-  if (config_.voice_detection.enabled) {
-    capture_.stats.voice_detected =
-        submodules_.voice_detector->ProcessCaptureAudio(capture_buffer);
-  } else {
-    capture_.stats.voice_detected = absl::nullopt;
-  }
-
  if (submodules_.agc_manager) {
    submodules_.agc_manager->Process(capture_buffer);

@ -1682,7 +1664,7 @@ bool AudioProcessingImpl::UpdateActiveSubmoduleStates() {
      !!submodules_.gain_controller2,
      config_.pre_amplifier.enabled || config_.capture_level_adjustment.enabled,
      capture_nonlocked_.echo_controller_enabled,
-      config_.voice_detection.enabled, !!submodules_.transient_suppressor);
+      !!submodules_.transient_suppressor);
 }

 void AudioProcessingImpl::InitializeTransientSuppressor() {
@ -1732,14 +1714,6 @@ void AudioProcessingImpl::InitializeHighPassFilter(bool forced_reset) {
  }
 }

-void AudioProcessingImpl::InitializeVoiceDetector() {
-  if (config_.voice_detection.enabled) {
-    submodules_.voice_detector = std::make_unique<VoiceDetection>(
-        proc_split_sample_rate_hz(), VoiceDetection::kVeryLowLikelihood);
-  } else {
-    submodules_.voice_detector.reset();
-  }
-}
 void AudioProcessingImpl::InitializeEchoController() {
  bool use_echo_controller =
      echo_control_factory_ ||
--- a/modules/audio_processing/audio_processing_impl.h
+++ b/modules/audio_processing/audio_processing_impl.h
@ -39,7 +39,6 @@
 #include "modules/audio_processing/render_queue_item_verifier.h"
 #include "modules/audio_processing/rms_level.h"
 #include "modules/audio_processing/transient/transient_suppressor.h"
-#include "modules/audio_processing/voice_detection.h"
 #include "rtc_base/gtest_prod_util.h"
 #include "rtc_base/ignore_wundef.h"
 #include "rtc_base/swap_queue.h"
@ -208,7 +207,6 @@ class AudioProcessingImpl : public AudioProcessing {
                bool gain_controller2_enabled,
                bool gain_adjustment_enabled,
                bool echo_controller_enabled,
-                bool voice_detector_enabled,
                bool transient_suppressor_enabled);
    bool CaptureMultiBandSubModulesActive() const;
    bool CaptureMultiBandProcessingPresent() const;
@ -231,7 +229,6 @@ class AudioProcessingImpl : public AudioProcessing {
    bool gain_controller2_enabled_ = false;
    bool gain_adjustment_enabled_ = false;
    bool echo_controller_enabled_ = false;
-    bool voice_detector_enabled_ = false;
    bool transient_suppressor_enabled_ = false;
    bool first_update_ = true;
  };
@ -267,7 +264,6 @@ class AudioProcessingImpl : public AudioProcessing {
  // already acquired.
  void InitializeHighPassFilter(bool forced_reset)
      RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
-  void InitializeVoiceDetector() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
  void InitializeGainController1() RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
  void InitializeTransientSuppressor()
      RTC_EXCLUSIVE_LOCKS_REQUIRED(mutex_capture_);
@ -400,7 +396,6 @@ class AudioProcessingImpl : public AudioProcessing {
    std::unique_ptr<EchoControlMobileImpl> echo_control_mobile;
    std::unique_ptr<NoiseSuppressor> noise_suppressor;
    std::unique_ptr<TransientSuppressor> transient_suppressor;
-    std::unique_ptr<VoiceDetection> voice_detector;
    std::unique_ptr<CaptureLevelsAdjuster> capture_levels_adjuster;
  } submodules_;

--- a/modules/audio_processing/audio_processing_impl_locking_unittest.cc
+++ b/modules/audio_processing/audio_processing_impl_locking_unittest.cc
@ -483,7 +483,6 @@ AudioProcessing::Config GetApmTestConfig(AecType aec_type) {
  apm_config.gain_controller1.mode =
      AudioProcessing::Config::GainController1::kAdaptiveDigital;
  apm_config.noise_suppression.enabled = true;
-  apm_config.voice_detection.enabled = true;
  return apm_config;
 }

--- a/modules/audio_processing/audio_processing_performance_unittest.cc
+++ b/modules/audio_processing/audio_processing_performance_unittest.cc
@ -441,7 +441,6 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
      apm_config.gain_controller1.enabled = true;
      apm_config.gain_controller1.mode =
          AudioProcessing::Config::GainController1::kAdaptiveDigital;
-      apm_config.voice_detection.enabled = true;
      apm->ApplyConfig(apm_config);
    };

@ -453,7 +452,6 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
      apm_config.noise_suppression.enabled = true;
      apm_config.gain_controller1.mode =
          AudioProcessing::Config::GainController1::kAdaptiveDigital;
-      apm_config.voice_detection.enabled = true;
      apm->ApplyConfig(apm_config);
    };

@ -464,7 +462,6 @@ class CallSimulator : public ::testing::TestWithParam<SimulationConfig> {
      apm_config.echo_canceller.enabled = false;
      apm_config.gain_controller1.enabled = false;
      apm_config.noise_suppression.enabled = false;
-      apm_config.voice_detection.enabled = false;
      apm->ApplyConfig(apm_config);
    };

--- a/modules/audio_processing/audio_processing_unittest.cc
+++ b/modules/audio_processing/audio_processing_unittest.cc
@ -190,7 +190,6 @@ void EnableAllAPComponents(AudioProcessing* ap) {
  apm_config.noise_suppression.enabled = true;

  apm_config.high_pass_filter.enabled = true;
-  apm_config.voice_detection.enabled = true;
  apm_config.pipeline.maximum_internal_processing_rate = 48000;
  ap->ApplyConfig(apm_config);
 }
@ -1226,7 +1225,6 @@ TEST_F(ApmTest, AllProcessingDisabledByDefault) {
  EXPECT_FALSE(config.high_pass_filter.enabled);
  EXPECT_FALSE(config.gain_controller1.enabled);
  EXPECT_FALSE(config.noise_suppression.enabled);
-  EXPECT_FALSE(config.voice_detection.enabled);
 }

 TEST_F(ApmTest, NoProcessingWhenAllComponentsDisabled) {
@ -1367,48 +1365,6 @@ TEST_F(ApmTest, SplittingFilter) {
  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
  apm_->ApplyConfig(apm_config);

-  // 3. Only GetStatistics-reporting VAD is enabled...
-  SetFrameTo(&frame_, 1000);
-  frame_copy.CopyFrom(frame_);
-  apm_config.voice_detection.enabled = true;
-  apm_->ApplyConfig(apm_config);
-  EXPECT_EQ(apm_->kNoError,
-            apm_->ProcessStream(
-                frame_.data.data(),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                frame_.data.data()));
-  EXPECT_EQ(apm_->kNoError,
-            apm_->ProcessStream(
-                frame_.data.data(),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                frame_.data.data()));
-  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
-  apm_config.voice_detection.enabled = false;
-  apm_->ApplyConfig(apm_config);
-
-  // 4. The VAD is enabled...
-  SetFrameTo(&frame_, 1000);
-  frame_copy.CopyFrom(frame_);
-  apm_config.voice_detection.enabled = true;
-  apm_->ApplyConfig(apm_config);
-  EXPECT_EQ(apm_->kNoError,
-            apm_->ProcessStream(
-                frame_.data.data(),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                frame_.data.data()));
-  EXPECT_EQ(apm_->kNoError,
-            apm_->ProcessStream(
-                frame_.data.data(),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                StreamConfig(frame_.sample_rate_hz, frame_.num_channels),
-                frame_.data.data()));
-  EXPECT_TRUE(FrameDataAreEqual(frame_, frame_copy));
-  apm_config.voice_detection.enabled = false;
-  apm_->ApplyConfig(apm_config);
-
  // Check the test is valid. We should have distortion from the filter
  // when AEC is enabled (which won't affect the audio).
  apm_config.echo_canceller.enabled = true;
@ -1736,7 +1692,6 @@ TEST_F(ApmTest, Process) {
         static_cast<size_t>(test->num_reverse_channels()), true);

    int frame_count = 0;
-    int has_voice_count = 0;
    int analog_level = 127;
    int analog_level_average = 0;
    int max_output_average = 0;
@ -1772,8 +1727,6 @@ TEST_F(ApmTest, Process) {
      analog_level = apm_->recommended_stream_analog_level();
      analog_level_average += analog_level;
      AudioProcessingStats stats = apm_->GetStatistics();
-      EXPECT_TRUE(stats.voice_detected);
-      has_voice_count += *stats.voice_detected ? 1 : 0;

      size_t frame_size = frame_.samples_per_channel * frame_.num_channels;
      size_t write_count =
@ -1829,33 +1782,23 @@ TEST_F(ApmTest, Process) {

    if (!absl::GetFlag(FLAGS_write_apm_ref_data)) {
      const int kIntNear = 1;
-      // When running the test on a N7 we get a {2, 6} difference of
-      // `has_voice_count` and `max_output_average` is up to 18 higher.
-      // All numbers being consistently higher on N7 compare to ref_data.
+      // All numbers are consistently higher on N7 compared to the reference
+      // data.
      // TODO(bjornv): If we start getting more of these offsets on Android we
      // should consider a different approach. Either using one slack for all,
      // or generate a separate android reference.
 #if defined(WEBRTC_ANDROID) || defined(WEBRTC_IOS)
-      const int kHasVoiceCountOffset = 3;
-      const int kHasVoiceCountNear = 8;
      const int kMaxOutputAverageOffset = 9;
      const int kMaxOutputAverageNear = 26;
 #else
-      const int kHasVoiceCountOffset = 0;
-      const int kHasVoiceCountNear = kIntNear;
      const int kMaxOutputAverageOffset = 0;
      const int kMaxOutputAverageNear = kIntNear;
 #endif
-      EXPECT_NEAR(test->has_voice_count(),
-                  has_voice_count - kHasVoiceCountOffset, kHasVoiceCountNear);
-
      EXPECT_NEAR(test->analog_level_average(), analog_level_average, kIntNear);
      EXPECT_NEAR(test->max_output_average(),
                  max_output_average - kMaxOutputAverageOffset,
                  kMaxOutputAverageNear);
    } else {
-      test->set_has_voice_count(has_voice_count);
-
      test->set_analog_level_average(analog_level_average);
      test->set_max_output_average(max_output_average);
    }
@ -2685,7 +2628,6 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(bool mobile_aec) {
  apm_config.echo_canceller.enabled = true;
  apm_config.echo_canceller.mobile_mode = mobile_aec;
  apm_config.noise_suppression.enabled = false;
-  apm_config.voice_detection.enabled = false;
  apm->ApplyConfig(apm_config);
  return apm;
 }
@ -2794,10 +2736,9 @@ TEST(MAYBE_ApmStatistics, AECMEnabledTest) {
  EXPECT_FALSE(stats.echo_return_loss_enhancement.has_value());
 }

-TEST(ApmStatistics, ReportHasVoice) {
+TEST(ApmStatistics, DoNotReportVoiceDetectedStat) {
  ProcessingConfig processing_config = {
      {{32000, 1}, {32000, 1}, {32000, 1}, {32000, 1}}};
-  AudioProcessing::Config config;

  // Set up an audioframe.
  Int16FrameData frame;
@ -2814,37 +2755,14 @@ TEST(ApmStatistics, ReportHasVoice) {
      AudioProcessingBuilderForTesting().Create();
  apm->Initialize(processing_config);

-  // If not enabled, no metric should be reported.
+  // No metric should be reported.
  EXPECT_EQ(
      apm->ProcessStream(frame.data.data(),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
                         frame.data.data()),
      0);
-  EXPECT_FALSE(apm->GetStatistics().voice_detected);
-
-  // If enabled, metrics should be reported.
-  config.voice_detection.enabled = true;
-  apm->ApplyConfig(config);
-  EXPECT_EQ(
-      apm->ProcessStream(frame.data.data(),
-                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
-                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
-                         frame.data.data()),
-      0);
-  auto stats = apm->GetStatistics();
-  EXPECT_TRUE(stats.voice_detected);
-
-  // If re-disabled, the value is again not reported.
-  config.voice_detection.enabled = false;
-  apm->ApplyConfig(config);
-  EXPECT_EQ(
-      apm->ProcessStream(frame.data.data(),
-                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
-                         StreamConfig(frame.sample_rate_hz, frame.num_channels),
-                         frame.data.data()),
-      0);
-  EXPECT_FALSE(apm->GetStatistics().voice_detected);
+  EXPECT_FALSE(apm->GetStatistics().voice_detected.has_value());
 }

 TEST(ApmStatistics, GetStatisticsReportsNoEchoDetectorStatsWhenDisabled) {
--- a/modules/audio_processing/include/audio_processing.cc
+++ b/modules/audio_processing/include/audio_processing.cc
@ -145,7 +145,6 @@ std::string AudioProcessing::Config::ToString() const {
          << NoiseSuppressionLevelToString(noise_suppression.level)
          << " }, transient_suppression: { enabled: "
          << transient_suppression.enabled
-          << " }, voice_detection: { enabled: " << voice_detection.enabled
          << " }, gain_controller1: { enabled: " << gain_controller1.enabled
          << ", mode: " << GainController1ModeToString(gain_controller1.mode)
          << ", target_level_dbfs: " << gain_controller1.target_level_dbfs
--- a/modules/audio_processing/include/audio_processing.h
+++ b/modules/audio_processing/include/audio_processing.h
@ -113,8 +113,6 @@ static constexpr int kClippedLevelMin = 70;
 //
 // config.high_pass_filter.enabled = true;
 //
-// config.voice_detection.enabled = true;
-//
 // apm->ApplyConfig(config)
 //
 // apm->noise_reduction()->set_level(kHighSuppression);
@ -232,11 +230,6 @@ class RTC_EXPORT AudioProcessing : public rtc::RefCountInterface {
      bool enabled = false;
    } transient_suppression;

-    // Enables reporting of `voice_detected` in webrtc::AudioProcessingStats.
-    struct VoiceDetection {
-      bool enabled = false;
-    } voice_detection;
-
    // Enables automatic gain control (AGC) functionality.
    // The automatic gain control (AGC) component brings the signal to an
    // appropriate range. This is done by applying a digital gain directly and,
--- a/modules/audio_processing/include/audio_processing_statistics.h
+++ b/modules/audio_processing/include/audio_processing_statistics.h
@ -24,6 +24,8 @@ struct RTC_EXPORT AudioProcessingStats {
  AudioProcessingStats(const AudioProcessingStats& other);
  ~AudioProcessingStats();

+  // Deprecated.
+  // TODO(bugs.webrtc.org/11226): Remove.
  // True if voice is detected in the last capture frame, after processing.
  // It is conservative in flagging audio as speech, with low likelihood of
  // incorrectly flagging a frame as voice.
--- a/modules/audio_processing/test/audio_processing_simulator.cc
+++ b/modules/audio_processing/test/audio_processing_simulator.cc
@ -543,10 +543,6 @@ void AudioProcessingSimulator::ConfigureAudioProcessor() {
    apm_config.high_pass_filter.enabled = *settings_.use_hpf;
  }

-  if (settings_.use_vad) {
-    apm_config.voice_detection.enabled = *settings_.use_vad;
-  }
-
  if (settings_.use_agc) {
    apm_config.gain_controller1.enabled = *settings_.use_agc;
  }
--- a/modules/audio_processing/test/audio_processing_simulator.h
+++ b/modules/audio_processing/test/audio_processing_simulator.h
@ -105,7 +105,6 @@ struct SimulationSettings {
  absl::optional<bool> use_ns;
  absl::optional<int> use_ts;
  absl::optional<bool> use_analog_agc;
-  absl::optional<bool> use_vad;
  absl::optional<bool> use_all;
  absl::optional<bool> analog_agc_disable_digital_adaptive;
  absl::optional<int> agc_mode;
--- a/modules/audio_processing/test/audioproc_float_impl.cc
+++ b/modules/audio_processing/test/audioproc_float_impl.cc
@ -117,10 +117,6 @@ ABSL_FLAG(int,
          analog_agc,
          kParameterNotSpecifiedValue,
          "Activate (1) or deactivate (0) the analog AGC");
-ABSL_FLAG(int,
-          vad,
-          kParameterNotSpecifiedValue,
-          "Activate (1) or deactivate (0) the voice activity detector");
 ABSL_FLAG(bool,
          all_default,
          false,
@ -365,7 +361,6 @@ void SetSettingIfFlagSet(int32_t flag, absl::optional<bool>* parameter) {
 SimulationSettings CreateSettings() {
  SimulationSettings settings;
  if (absl::GetFlag(FLAGS_all_default)) {
-    settings.use_vad = true;
    settings.use_ts = true;
    settings.use_analog_agc = true;
    settings.use_ns = true;
@ -417,7 +412,6 @@ SimulationSettings CreateSettings() {
  SetSettingIfSpecified(absl::GetFlag(FLAGS_ts), &settings.use_ts);
  SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc),
                      &settings.use_analog_agc);
-  SetSettingIfFlagSet(absl::GetFlag(FLAGS_vad), &settings.use_vad);
  SetSettingIfFlagSet(absl::GetFlag(FLAGS_analog_agc_disable_digital_adaptive),
                      &settings.analog_agc_disable_digital_adaptive);
  SetSettingIfSpecified(absl::GetFlag(FLAGS_agc_mode), &settings.agc_mode);
--- a/modules/audio_processing/voice_detection.cc
+++ b/modules/audio_processing/voice_detection.cc
@ -1,92 +0,0 @@
-/*
- *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "modules/audio_processing/voice_detection.h"
-
-#include "common_audio/vad/include/webrtc_vad.h"
-#include "modules/audio_processing/audio_buffer.h"
-#include "rtc_base/checks.h"
-
-namespace webrtc {
-class VoiceDetection::Vad {
- public:
-  Vad() {
-    state_ = WebRtcVad_Create();
-    RTC_CHECK(state_);
-    int error = WebRtcVad_Init(state_);
-    RTC_DCHECK_EQ(0, error);
-  }
-  ~Vad() { WebRtcVad_Free(state_); }
-
-  Vad(Vad&) = delete;
-  Vad& operator=(Vad&) = delete;
-
-  VadInst* state() { return state_; }
-
- private:
-  VadInst* state_ = nullptr;
-};
-
-VoiceDetection::VoiceDetection(int sample_rate_hz, Likelihood likelihood)
-    : sample_rate_hz_(sample_rate_hz),
-      frame_size_samples_(static_cast<size_t>(sample_rate_hz_ / 100)),
-      likelihood_(likelihood),
-      vad_(new Vad()) {
-  int mode = 2;
-  switch (likelihood) {
-    case VoiceDetection::kVeryLowLikelihood:
-      mode = 3;
-      break;
-    case VoiceDetection::kLowLikelihood:
-      mode = 2;
-      break;
-    case VoiceDetection::kModerateLikelihood:
-      mode = 1;
-      break;
-    case VoiceDetection::kHighLikelihood:
-      mode = 0;
-      break;
-    default:
-      RTC_DCHECK_NOTREACHED();
-      break;
-  }
-  int error = WebRtcVad_set_mode(vad_->state(), mode);
-  RTC_DCHECK_EQ(0, error);
-}
-
-VoiceDetection::~VoiceDetection() {}
-
-bool VoiceDetection::ProcessCaptureAudio(AudioBuffer* audio) {
-  RTC_DCHECK_GE(AudioBuffer::kMaxSplitFrameLength,
-                audio->num_frames_per_band());
-  std::array<int16_t, AudioBuffer::kMaxSplitFrameLength> mixed_low_pass_data;
-  rtc::ArrayView<const int16_t> mixed_low_pass(mixed_low_pass_data.data(),
-                                               audio->num_frames_per_band());
-  if (audio->num_channels() == 1) {
-    FloatS16ToS16(audio->split_bands_const(0)[kBand0To8kHz],
-                  audio->num_frames_per_band(), mixed_low_pass_data.data());
-  } else {
-    const int num_channels = static_cast<int>(audio->num_channels());
-    for (size_t i = 0; i < audio->num_frames_per_band(); ++i) {
-      int32_t value =
-          FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[0][i]);
-      for (int j = 1; j < num_channels; ++j) {
-        value += FloatS16ToS16(audio->split_channels_const(kBand0To8kHz)[j][i]);
-      }
-      mixed_low_pass_data[i] = value / num_channels;
-    }
-  }
-
-  int vad_ret = WebRtcVad_Process(vad_->state(), sample_rate_hz_,
-                                  mixed_low_pass.data(), frame_size_samples_);
-  RTC_DCHECK(vad_ret == 0 || vad_ret == 1);
-  return vad_ret == 0 ? false : true;
-}
-}  // namespace webrtc
--- a/modules/audio_processing/voice_detection.h
+++ b/modules/audio_processing/voice_detection.h
@ -1,59 +0,0 @@
-/*
- *  Copyright (c) 2019 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
-#define MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
-
-#include <stddef.h>
-
-#include <memory>
-
-#include "modules/audio_processing/include/audio_processing.h"
-
-namespace webrtc {
-
-class AudioBuffer;
-
-// The voice activity detection (VAD) component analyzes the stream to
-// determine if voice is present.
-class VoiceDetection {
- public:
-  // Specifies the likelihood that a frame will be declared to contain voice.
-  // A higher value makes it more likely that speech will not be clipped, at
-  // the expense of more noise being detected as voice.
-  enum Likelihood {
-    kVeryLowLikelihood,
-    kLowLikelihood,
-    kModerateLikelihood,
-    kHighLikelihood
-  };
-
-  VoiceDetection(int sample_rate_hz, Likelihood likelihood);
-  ~VoiceDetection();
-
-  VoiceDetection(VoiceDetection&) = delete;
-  VoiceDetection& operator=(VoiceDetection&) = delete;
-
-  // Returns true if voice is detected in the current frame.
-  bool ProcessCaptureAudio(AudioBuffer* audio);
-
-  Likelihood likelihood() const { return likelihood_; }
-
- private:
-  class Vad;
-
-  int sample_rate_hz_;
-  size_t frame_size_samples_;
-  Likelihood likelihood_;
-  std::unique_ptr<Vad> vad_;
-};
-}  // namespace webrtc
-
-#endif  // MODULES_AUDIO_PROCESSING_VOICE_DETECTION_H_
--- a/modules/audio_processing/voice_detection_unittest.cc
+++ b/modules/audio_processing/voice_detection_unittest.cc
@ -1,104 +0,0 @@
-/*
- *  Copyright (c) 2016 The WebRTC project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include <vector>
-
-#include "api/array_view.h"
-#include "modules/audio_processing/audio_buffer.h"
-#include "modules/audio_processing/test/audio_buffer_tools.h"
-#include "modules/audio_processing/test/bitexactness_tools.h"
-#include "modules/audio_processing/voice_detection.h"
-#include "test/gtest.h"
-
-namespace webrtc {
-namespace {
-
-const int kNumFramesToProcess = 1000;
-
-// Process one frame of data and produce the output.
-bool ProcessOneFrame(int sample_rate_hz,
-                     AudioBuffer* audio_buffer,
-                     VoiceDetection* voice_detection) {
-  if (sample_rate_hz > AudioProcessing::kSampleRate16kHz) {
-    audio_buffer->SplitIntoFrequencyBands();
-  }
-
-  return voice_detection->ProcessCaptureAudio(audio_buffer);
-}
-
-// Processes a specified amount of frames, verifies the results and reports
-// any errors.
-void RunBitexactnessTest(int sample_rate_hz,
-                         size_t num_channels,
-                         bool stream_has_voice_reference) {
-  int sample_rate_to_use = std::min(sample_rate_hz, 16000);
-  VoiceDetection voice_detection(sample_rate_to_use,
-                                 VoiceDetection::kLowLikelihood);
-
-  int samples_per_channel = rtc::CheckedDivExact(sample_rate_hz, 100);
-  const StreamConfig capture_config(sample_rate_hz, num_channels);
-  AudioBuffer capture_buffer(
-      capture_config.sample_rate_hz(), capture_config.num_channels(),
-      capture_config.sample_rate_hz(), capture_config.num_channels(),
-      capture_config.sample_rate_hz(), capture_config.num_channels());
-  test::InputAudioFile capture_file(
-      test::GetApmCaptureTestVectorFileName(sample_rate_hz));
-  std::vector<float> capture_input(samples_per_channel * num_channels);
-  bool stream_has_voice = false;
-  for (int frame_no = 0; frame_no < kNumFramesToProcess; ++frame_no) {
-    ReadFloatSamplesFromStereoFile(samples_per_channel, num_channels,
-                                   &capture_file, capture_input);
-
-    test::CopyVectorToAudioBuffer(capture_config, capture_input,
-                                  &capture_buffer);
-
-    stream_has_voice =
-        ProcessOneFrame(sample_rate_hz, &capture_buffer, &voice_detection);
-  }
-
-  EXPECT_EQ(stream_has_voice_reference, stream_has_voice);
-}
-
-const bool kStreamHasVoiceReference = true;
-
-}  // namespace
-
-TEST(VoiceDetectionBitExactnessTest, Mono8kHz) {
-  RunBitexactnessTest(8000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Mono16kHz) {
-  RunBitexactnessTest(16000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Mono32kHz) {
-  RunBitexactnessTest(32000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Mono48kHz) {
-  RunBitexactnessTest(48000, 1, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo8kHz) {
-  RunBitexactnessTest(8000, 2, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo16kHz) {
-  RunBitexactnessTest(16000, 2, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo32kHz) {
-  RunBitexactnessTest(32000, 2, kStreamHasVoiceReference);
-}
-
-TEST(VoiceDetectionBitExactnessTest, Stereo48kHz) {
-  RunBitexactnessTest(48000, 2, kStreamHasVoiceReference);
-}
-
-}  // namespace webrtc
--- a/test/fuzzers/audio_processing_configs_fuzzer.cc
+++ b/test/fuzzers/audio_processing_configs_fuzzer.cc
@ -54,7 +54,7 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
  bool use_agc = fuzz_data->ReadOrDefaultValue(true);
  bool use_ns = fuzz_data->ReadOrDefaultValue(true);
  static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
-  bool use_vad = fuzz_data->ReadOrDefaultValue(true);
+  static_cast<void>(fuzz_data->ReadOrDefaultValue(true));
  bool use_agc_limiter = fuzz_data->ReadOrDefaultValue(true);
  bool use_agc2 = fuzz_data->ReadOrDefaultValue(true);

@ -114,7 +114,6 @@ rtc::scoped_refptr<AudioProcessing> CreateApm(test::FuzzDataHelper* fuzz_data,
      use_agc2_adaptive_digital;
  apm_config.noise_suppression.enabled = use_ns;
  apm_config.transient_suppression.enabled = use_ts;
-  apm_config.voice_detection.enabled = use_vad;

  rtc::scoped_refptr<AudioProcessing> apm =
      AudioProcessingBuilderForTesting()