webrtc/modules/audio_processing/transient/transient_suppressor_unittest.cc

/*
 *  Copyright (c) 2014 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/transient/transient_suppressor.h"

#include <algorithm>
#include <vector>

#include "absl/types/optional.h"
#include "modules/audio_processing/transient/common.h"
#include "modules/audio_processing/transient/transient_suppressor_impl.h"
#include "test/gtest.h"

namespace webrtc {
namespace {
// Channel count (mono) passed to every `TransientSuppressorImpl` instance and
// `Suppress()` call in this file.
constexpr int kMono = 1;

// Returns the index of the first non-zero sample in `samples`, or
// `absl::nullopt` if `samples` is empty or contains only zeros.
absl::optional<int> FindFirstNonZeroSample(const std::vector<float>& samples) {
  const auto first_non_zero =
      std::find_if(samples.begin(), samples.end(),
                   [](float sample) { return sample != 0.0f; });
  if (first_non_zero == samples.end()) {
    return absl::nullopt;
  }
  // The iterator distance is narrowed explicitly to the `int` carried by the
  // returned optional rather than relying on an implicit conversion.
  return static_cast<int>(first_non_zero - samples.begin());
}

}  // namespace

// Test fixture parametrized over `TransientSuppressor::VadMode`; `GetParam()`
// yields the VAD mode the suppressor under test is constructed with.
class TransientSuppressorVadModeParametrization
    : public ::testing::TestWithParam<TransientSuppressor::VadMode> {};

// Drives a mono `TransientSuppressorImpl` through a scripted sequence of
// `UpdateKeypress()` calls and checks `detection_enabled_` and
// `suppression_enabled_` after each step. Every loop iteration stands for one
// chunk of `ts::kChunkSizeMs` milliseconds.
TEST_P(TransientSuppressorVadModeParametrization,
       TypingDetectionLogicWorksAsExpectedForMono) {
  constexpr int kRounds = 100;

  TransientSuppressorImpl suppressor(GetParam(), ts::kSampleRate16kHz,
                                     ts::kSampleRate16kHz, kMono);

  // Detection is initially off; a single key-press turns it on.
  EXPECT_FALSE(suppressor.detection_enabled_);
  suppressor.UpdateKeypress(true);
  EXPECT_TRUE(suppressor.detection_enabled_);

  // Detection stays on until four seconds have elapsed without a key-press;
  // the chunk that completes the four seconds turns it off.
  int elapsed_ms = 0;
  while (elapsed_ms < 3990) {
    suppressor.UpdateKeypress(false);
    EXPECT_TRUE(suppressor.detection_enabled_);
    elapsed_ms += ts::kChunkSizeMs;
  }
  suppressor.UpdateKeypress(false);
  EXPECT_FALSE(suppressor.detection_enabled_);

  // Isolated key-presses, each followed by more than a second of silence,
  // never turn suppression on.
  for (int round = 0; round < kRounds; ++round) {
    EXPECT_FALSE(suppressor.suppression_enabled_);
    suppressor.UpdateKeypress(true);
    EXPECT_TRUE(suppressor.detection_enabled_);
    EXPECT_FALSE(suppressor.suppression_enabled_);

    elapsed_ms = 0;
    while (elapsed_ms < 990) {
      suppressor.UpdateKeypress(false);
      EXPECT_TRUE(suppressor.detection_enabled_);
      EXPECT_FALSE(suppressor.suppression_enabled_);
      elapsed_ms += ts::kChunkSizeMs;
    }
    suppressor.UpdateKeypress(false);
  }

  // Suppression is still off after the first of two back-to-back key-presses
  // and on after the second.
  suppressor.UpdateKeypress(true);
  EXPECT_FALSE(suppressor.suppression_enabled_);
  suppressor.UpdateKeypress(true);
  EXPECT_TRUE(suppressor.suppression_enabled_);

  // As long as key-presses keep arriving less than a second apart, both
  // detection and suppression remain enabled.
  for (int round = 0; round < kRounds; ++round) {
    elapsed_ms = 0;
    while (elapsed_ms < 1000) {
      suppressor.UpdateKeypress(false);
      EXPECT_TRUE(suppressor.detection_enabled_);
      EXPECT_TRUE(suppressor.suppression_enabled_);
      elapsed_ms += ts::kChunkSizeMs;
    }
    suppressor.UpdateKeypress(true);
    EXPECT_TRUE(suppressor.detection_enabled_);
    EXPECT_TRUE(suppressor.suppression_enabled_);
  }

  // Both flags stay set until four seconds have elapsed without a key-press.
  elapsed_ms = 0;
  while (elapsed_ms < 3990) {
    suppressor.UpdateKeypress(false);
    EXPECT_TRUE(suppressor.detection_enabled_);
    EXPECT_TRUE(suppressor.suppression_enabled_);
    elapsed_ms += ts::kChunkSizeMs;
  }

  // From then on, further silent chunks leave both flags cleared.
  elapsed_ms = 0;
  while (elapsed_ms < 1000) {
    suppressor.UpdateKeypress(false);
    EXPECT_FALSE(suppressor.detection_enabled_);
    EXPECT_FALSE(suppressor.suppression_enabled_);
    elapsed_ms += ts::kChunkSizeMs;
  }
}

// Runs every `TransientSuppressorVadModeParametrization` test once for each of
// the `kDefault`, `kRnnVad` and `kNoVad` VAD modes.
INSTANTIATE_TEST_SUITE_P(
    TransientSuppressorImplTest,
    TransientSuppressorVadModeParametrization,
    ::testing::Values(TransientSuppressor::VadMode::kDefault,
                      TransientSuppressor::VadMode::kRnnVad,
                      TransientSuppressor::VadMode::kNoVad));

// Test fixture parametrized over an `int`; the tests using it read
// `GetParam()` as the sample rate in Hz.
class TransientSuppressorSampleRateParametrization
    : public ::testing::TestWithParam<int> {};

// Verifies that the voice probability returned by `Suppress()` is delayed
// consistently with the delay observed on the processed audio samples.
TEST_P(TransientSuppressorSampleRateParametrization,
       CheckAudioAndVoiceProbabilityTemporallyAligned) {
  const int sample_rate_hz = GetParam();
  TransientSuppressorImpl suppressor(TransientSuppressor::VadMode::kDefault,
                                     sample_rate_hz,
                                     /*detection_rate_hz=*/sample_rate_hz,
                                     kMono);

  const int frame_size = sample_rate_hz * ts::kChunkSizeMs / 1000;
  std::vector<float> frame(frame_size);

  constexpr int kMaxAttempts = 3;
  for (int attempt = 0; attempt < kMaxAttempts; ++attempt) {
    SCOPED_TRACE(attempt);

    // Feed `Suppress()` a frame in which every sample is non-zero, always
    // reporting a voice probability of one.
    frame.assign(frame.size(), 1000.0f);
    const float delayed_voice_probability = suppressor.Suppress(
        frame.data(), frame.size(), kMono, /*detection_data=*/nullptr,
        /*detection_length=*/frame_size, /*reference_data=*/nullptr,
        /*reference_length=*/frame_size, /*voice_probability=*/1.0f,
        /*key_pressed=*/false);

    // The position of the first non-zero output sample reveals the
    // algorithmic delay of `TransientSuppressorImpl` within this frame.
    const absl::optional<int> frame_delay = FindFirstNonZeroSample(frame);

    if (!frame_delay.has_value()) {
      // The output is still all zeros: the algorithmic delay exceeds the
      // frames processed so far, so the delayed voice probability must still
      // be zero.
      EXPECT_EQ(delayed_voice_probability, 0.0f);
      continue;
    }

    // The delay has been measured; check the delayed voice probability
    // against it and stop.
    if (*frame_delay == 0) {
      // The delay is an integer multiple of the frame duration, hence
      // `Suppress()` returns a copy of a previously observed voice
      // probability value.
      EXPECT_EQ(delayed_voice_probability, 1.0f);
    } else {
      // The delay is fractional, hence `Suppress()` returns an interpolated
      // value. Its exact value depends on the interpolation method, so only
      // check that it is greater than zero, as it must converge towards the
      // previously observed value.
      EXPECT_GT(delayed_voice_probability, 0.0f);
    }
    break;
  }
}

// Runs every `TransientSuppressorSampleRateParametrization` test once for each
// of the listed `ts::kSampleRate*` constants (8, 16, 32 and 48 kHz by name).
INSTANTIATE_TEST_SUITE_P(TransientSuppressorImplTest,
                         TransientSuppressorSampleRateParametrization,
                         ::testing::Values(ts::kSampleRate8kHz,
                                           ts::kSampleRate16kHz,
                                           ts::kSampleRate32kHz,
                                           ts::kSampleRate48kHz));

}  // namespace webrtc