webrtc/modules/audio_processing/agc2/speech_level_estimator_unittest.cc

/*
 *  Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/agc2/speech_level_estimator.h"

#include <memory>

#include "api/audio/audio_processing.h"
#include "modules/audio_processing/agc2/agc2_common.h"
#include "modules/audio_processing/logging/apm_data_dumper.h"
#include "rtc_base/gunit.h"

namespace webrtc {
namespace {

using AdaptiveDigitalConfig =
    AudioProcessing::Config::GainController2::AdaptiveDigital;

// Number of speech frames that the level estimator must observe in order to
// become confident about the estimated level.
constexpr int kNumFramesToConfidence =
    kLevelEstimatorTimeToConfidenceMs / kFrameDurationMs;
static_assert(kNumFramesToConfidence > 0, "");

constexpr float kConvergenceSpeedTestsLevelTolerance = 0.5f;

// Provides the `vad_level` value `num_iterations` times to `level_estimator`.
void RunOnConstantLevel(int num_iterations,
                        float rms_dbfs,
                        float peak_dbfs,
                        float speech_probability,
                        SpeechLevelEstimator& level_estimator) {
  for (int i = 0; i < num_iterations; ++i) {
    level_estimator.Update(rms_dbfs, peak_dbfs, speech_probability);
  }
}

constexpr float kNoSpeechProbability = 0.0f;
constexpr float kLowSpeechProbability = kVadConfidenceThreshold / 2.0f;
constexpr float kMaxSpeechProbability = 1.0f;

// Level estimator with data dumper.
struct TestLevelEstimator {
  explicit TestLevelEstimator(int adjacent_speech_frames_threshold)
      : data_dumper(0),
        estimator(std::make_unique<SpeechLevelEstimator>(
            &data_dumper,
            AdaptiveDigitalConfig{},
            adjacent_speech_frames_threshold)),
        initial_speech_level_dbfs(estimator->level_dbfs()),
        level_rms_dbfs(initial_speech_level_dbfs / 2.0f),
        level_peak_dbfs(initial_speech_level_dbfs / 3.0f) {
    RTC_DCHECK_LT(level_rms_dbfs, level_peak_dbfs);
    RTC_DCHECK_LT(initial_speech_level_dbfs, level_rms_dbfs);
    RTC_DCHECK_GT(level_rms_dbfs - initial_speech_level_dbfs, 5.0f)
        << "Adjust `level_rms_dbfs` so that the difference from the initial "
           "level is wide enough for the tests";
  }
  ApmDataDumper data_dumper;
  std::unique_ptr<SpeechLevelEstimator> estimator;
  const float initial_speech_level_dbfs;
  const float level_rms_dbfs;
  const float level_peak_dbfs;
};

// Checks that the level estimator converges to a constant input speech level.
TEST(GainController2SpeechLevelEstimator, LevelStabilizes) {
  TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1);
  RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
                     level_estimator.level_rms_dbfs,
                     level_estimator.level_peak_dbfs, kMaxSpeechProbability,
                     *level_estimator.estimator);
  const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
  RunOnConstantLevel(/*num_iterations=*/1, level_estimator.level_rms_dbfs,
                     level_estimator.level_peak_dbfs, kMaxSpeechProbability,
                     *level_estimator.estimator);
  EXPECT_NEAR(level_estimator.estimator->level_dbfs(), estimated_level_dbfs,
              0.1f);
}

// Checks that the level controller does not become confident when too few
// speech frames are observed.
TEST(GainController2SpeechLevelEstimator, IsNotConfident) {
  TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1);
  RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence / 2,
                     level_estimator.level_rms_dbfs,
                     level_estimator.level_peak_dbfs, kMaxSpeechProbability,
                     *level_estimator.estimator);
  EXPECT_FALSE(level_estimator.estimator->is_confident());
}

// Checks that the level controller becomes confident when enough speech frames
// are observed.
TEST(GainController2SpeechLevelEstimator, IsConfident) {
  TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1);
  RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
                     level_estimator.level_rms_dbfs,
                     level_estimator.level_peak_dbfs, kMaxSpeechProbability,
                     *level_estimator.estimator);
  EXPECT_TRUE(level_estimator.estimator->is_confident());
}

// Checks that the estimated level is not affected by the level of non-speech
// frames.
TEST(GainController2SpeechLevelEstimator, EstimatorIgnoresNonSpeechFrames) {
  TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1);
  // Simulate speech.
  RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
                     level_estimator.level_rms_dbfs,
                     level_estimator.level_peak_dbfs, kMaxSpeechProbability,
                     *level_estimator.estimator);
  const float estimated_level_dbfs = level_estimator.estimator->level_dbfs();
  // Simulate full-scale non-speech.
  RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
                     /*rms_dbfs=*/0.0f, /*peak_dbfs=*/0.0f,
                     kNoSpeechProbability, *level_estimator.estimator);
  // No estimated level change is expected.
  EXPECT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
                  estimated_level_dbfs);
}

// Checks the convergence speed of the estimator before it becomes confident.
TEST(GainController2SpeechLevelEstimator, ConvergenceSpeedBeforeConfidence) {
  TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1);
  RunOnConstantLevel(/*num_iterations=*/kNumFramesToConfidence,
                     level_estimator.level_rms_dbfs,
                     level_estimator.level_peak_dbfs, kMaxSpeechProbability,
                     *level_estimator.estimator);
  EXPECT_NEAR(level_estimator.estimator->level_dbfs(),
              level_estimator.level_rms_dbfs,
              kConvergenceSpeedTestsLevelTolerance);
}

// Checks the convergence speed of the estimator after it becomes confident.
TEST(GainController2SpeechLevelEstimator, ConvergenceSpeedAfterConfidence) {
  TestLevelEstimator level_estimator(/*adjacent_speech_frames_threshold=*/1);
  // Reach confidence using the initial level estimate.
  RunOnConstantLevel(
      /*num_iterations=*/kNumFramesToConfidence,
      /*rms_dbfs=*/level_estimator.initial_speech_level_dbfs,
      /*peak_dbfs=*/level_estimator.initial_speech_level_dbfs + 6.0f,
      kMaxSpeechProbability, *level_estimator.estimator);
  // No estimate change should occur, but confidence is achieved.
  ASSERT_FLOAT_EQ(level_estimator.estimator->level_dbfs(),
                  level_estimator.initial_speech_level_dbfs);
  ASSERT_TRUE(level_estimator.estimator->is_confident());
  // After confidence.
  constexpr float kConvergenceTimeAfterConfidenceNumFrames = 700;  // 7 seconds.
  static_assert(
      kConvergenceTimeAfterConfidenceNumFrames > kNumFramesToConfidence, "");
  RunOnConstantLevel(
      /*num_iterations=*/kConvergenceTimeAfterConfidenceNumFrames,
      level_estimator.level_rms_dbfs, level_estimator.level_peak_dbfs,
      kMaxSpeechProbability, *level_estimator.estimator);
  EXPECT_NEAR(level_estimator.estimator->level_dbfs(),
              level_estimator.level_rms_dbfs,
              kConvergenceSpeedTestsLevelTolerance);
}

class SpeechLevelEstimatorParametrization
    : public ::testing::TestWithParam<int> {
 protected:
  int adjacent_speech_frames_threshold() const { return GetParam(); }
};

TEST_P(SpeechLevelEstimatorParametrization, DoNotAdaptToShortSpeechSegments) {
  TestLevelEstimator level_estimator(adjacent_speech_frames_threshold());
  const float initial_level = level_estimator.estimator->level_dbfs();
  ASSERT_LT(initial_level, level_estimator.level_peak_dbfs);
  for (int i = 0; i < adjacent_speech_frames_threshold() - 1; ++i) {
    SCOPED_TRACE(i);
    level_estimator.estimator->Update(level_estimator.level_rms_dbfs,
                                      level_estimator.level_peak_dbfs,
                                      kMaxSpeechProbability);
    EXPECT_EQ(initial_level, level_estimator.estimator->level_dbfs());
  }
  level_estimator.estimator->Update(level_estimator.level_rms_dbfs,
                                    level_estimator.level_peak_dbfs,
                                    kLowSpeechProbability);
  EXPECT_EQ(initial_level, level_estimator.estimator->level_dbfs());
}

TEST_P(SpeechLevelEstimatorParametrization, AdaptToEnoughSpeechSegments) {
  TestLevelEstimator level_estimator(adjacent_speech_frames_threshold());
  const float initial_level = level_estimator.estimator->level_dbfs();
  ASSERT_LT(initial_level, level_estimator.level_peak_dbfs);
  for (int i = 0; i < adjacent_speech_frames_threshold(); ++i) {
    level_estimator.estimator->Update(level_estimator.level_rms_dbfs,
                                      level_estimator.level_peak_dbfs,
                                      kMaxSpeechProbability);
  }
  EXPECT_LT(initial_level, level_estimator.estimator->level_dbfs());
}

INSTANTIATE_TEST_SUITE_P(GainController2,
                         SpeechLevelEstimatorParametrization,
                         ::testing::Values(1, 9, 17));

}  // namespace
}  // namespace webrtc