webrtc/modules/audio_processing/aec3/residual_echo_estimator.cc

/*
 *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "modules/audio_processing/aec3/residual_echo_estimator.h"

#include <stddef.h>

#include <algorithm>
#include <vector>

#include "api/array_view.h"
#include "modules/audio_processing/aec3/reverb_model.h"
#include "rtc_base/checks.h"
#include "system_wrappers/include/field_trial.h"

namespace webrtc {
namespace {

// Gain used for the echo path when transparent mode is active.
constexpr float kDefaultTransparentModeGain{0.01f};

// Returns the transparent-mode gain. The estimator uses this value for both
// the early and the late reflections.
float GetTransparentModeGain() {
  constexpr float kGain = kDefaultTransparentModeGain;
  return kGain;
}

// Returns the default-mode gain for the early reflections. The field trial
// "WebRTC-Aec3UseLowEarlyReflectionsDefaultGain" replaces the configured
// default gain with a fixed value of 0.1.
float GetEarlyReflectionsDefaultModeGain(
    const EchoCanceller3Config::EpStrength& config) {
  const bool use_low_gain =
      field_trial::IsEnabled("WebRTC-Aec3UseLowEarlyReflectionsDefaultGain");
  return use_low_gain ? 0.1f : config.default_gain;
}

// Returns the default-mode gain for the late reflections. The field trial
// "WebRTC-Aec3UseLowLateReflectionsDefaultGain" replaces the configured
// default gain with a fixed value of 0.1.
float GetLateReflectionsDefaultModeGain(
    const EchoCanceller3Config::EpStrength& config) {
  const bool use_low_gain =
      field_trial::IsEnabled("WebRTC-Aec3UseLowLateReflectionsDefaultGain");
  return use_low_gain ? 0.1f : config.default_gain;
}

// Tells whether the onset-compensated ERLE should be used also while the
// nearend is dominant. The behavior is enabled either through the
// configuration or through the field trial
// "WebRTC-Aec3UseErleOnsetCompensationInDominantNearend".
bool UseErleOnsetCompensationInDominantNearend(
    const EchoCanceller3Config::EpStrength& config) {
  // The configuration takes precedence; the field trial is only consulted
  // when the configuration does not enable the compensation.
  if (config.erle_onset_compensation_in_dominant_nearend) {
    return true;
  }
  return field_trial::IsEnabled(
      "WebRTC-Aec3UseErleOnsetCompensationInDominantNearend");
}

// Determines the range of spectrum buffer indexes that covers the blocks
// around the filter delay. The range starts `render_pre_window_size` blocks
// before the delay (never before offset 0) and extends
// `render_post_window_size` blocks after it. `*idx_stop` is the index one step
// past the last block of the range.
void GetRenderIndexesToAnalyze(
    const SpectrumBuffer& spectrum_buffer,
    const EchoCanceller3Config::EchoModel& echo_model,
    int filter_delay_blocks,
    int* idx_start,
    int* idx_stop) {
  RTC_DCHECK(idx_start);
  RTC_DCHECK(idx_stop);

  const int pre_window_blocks =
      static_cast<int>(echo_model.render_pre_window_size);
  const int post_window_blocks =
      static_cast<int>(echo_model.render_post_window_size);

  // Offsets are relative to the read position of the spectrum buffer.
  const size_t first_offset =
      std::max(0, filter_delay_blocks - pre_window_blocks);
  const size_t last_offset = filter_delay_blocks + post_window_blocks;

  *idx_start = spectrum_buffer.OffsetIndex(spectrum_buffer.read, first_offset);
  *idx_stop =
      spectrum_buffer.OffsetIndex(spectrum_buffer.read, last_offset + 1);
}

// Computes the residual echo power of each capture channel as the linear echo
// power estimate divided by the echo return loss enhancement (ERLE), bin by
// bin. All three views must hold the same number of channels.
void LinearEstimate(
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> erle,
    rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
  RTC_DCHECK_EQ(S2_linear.size(), erle.size());
  RTC_DCHECK_EQ(S2_linear.size(), R2.size());

  for (size_t ch = 0; ch < R2.size(); ++ch) {
    const auto& linear_power = S2_linear[ch];
    const auto& channel_erle = erle[ch];
    auto& residual_power = R2[ch];
    for (size_t bin = 0; bin < kFftLengthBy2Plus1; ++bin) {
      // The ERLE is a divisor and is expected to be strictly positive.
      RTC_DCHECK_LT(0.f, channel_erle[bin]);
      residual_power[bin] = linear_power[bin] / channel_erle[bin];
    }
  }
}

// Estimates the residual echo power based on the estimate of the echo path
// gain.
void NonLinearEstimate(
    float echo_path_gain,
    const std::array<float, kFftLengthBy2Plus1>& X2,
    rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) {
  const size_t num_capture_channels = R2.size();
  for (size_t ch = 0; ch < num_capture_channels; ++ch) {
    for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
      R2[ch][k] = X2[k] * echo_path_gain;
    }
  }
}

// Applies a soft noise gate to the echo generating power, in place. Bins whose
// power is below `noise_gate_power` are reduced in proportion to how far below
// the gate they are (scaled by `noise_gate_slope`) and floored at zero. All
// other bins are left untouched.
void ApplyNoiseGate(const EchoCanceller3Config::EchoModel& config,
                    rtc::ArrayView<float, kFftLengthBy2Plus1> X2) {
  for (float& bin_power : X2) {
    if (!(config.noise_gate_power > bin_power)) {
      continue;
    }
    const float distance_below_gate = config.noise_gate_power - bin_power;
    bin_power = std::max(
        0.f, bin_power - config.noise_gate_slope * distance_below_gate);
  }
}

// Estimates the echo generating signal power as the per-bin maximum of the
// render power over a window of blocks around the filter delay. With several
// render channels, the power of a block is the sum over the channels.
void EchoGeneratingPower(size_t num_render_channels,
                         const SpectrumBuffer& spectrum_buffer,
                         const EchoCanceller3Config::EchoModel& echo_model,
                         int filter_delay_blocks,
                         rtc::ArrayView<float, kFftLengthBy2Plus1> X2) {
  int first_idx;
  int end_idx;
  GetRenderIndexesToAnalyze(spectrum_buffer, echo_model, filter_delay_blocks,
                            &first_idx, &end_idx);

  // Start from zero so that the running maximum below is well defined.
  std::fill(X2.begin(), X2.end(), 0.f);

  if (num_render_channels == 1) {
    // Mono: take the maximum directly over the single channel.
    for (int idx = first_idx; idx != end_idx;
         idx = spectrum_buffer.IncIndex(idx)) {
      const auto& mono_power = spectrum_buffer.buffer[idx][/*channel=*/0];
      for (size_t bin = 0; bin < kFftLengthBy2Plus1; ++bin) {
        X2[bin] = std::max(X2[bin], mono_power[bin]);
      }
    }
    return;
  }

  // Multichannel: sum the channel powers of each block before taking the
  // maximum over the blocks.
  for (int idx = first_idx; idx != end_idx;
       idx = spectrum_buffer.IncIndex(idx)) {
    std::array<float, kFftLengthBy2Plus1> summed_power;
    summed_power.fill(0.f);
    for (size_t ch = 0; ch < num_render_channels; ++ch) {
      const auto& single_channel_power = spectrum_buffer.buffer[idx][ch];
      for (size_t bin = 0; bin < kFftLengthBy2Plus1; ++bin) {
        summed_power[bin] += single_channel_power[bin];
      }
    }
    for (size_t bin = 0; bin < kFftLengthBy2Plus1; ++bin) {
      X2[bin] = std::max(X2[bin], summed_power[bin]);
    }
  }
}

}  // namespace

// Constructs the estimator for `num_render_channels` render channels. The
// reflection gains and the ERLE onset-compensation flag are resolved once
// here, from the configuration and the field trials consulted by the helper
// functions in the anonymous namespace above. Reset() then initializes the
// reverb model and the render noise floor state.
ResidualEchoEstimator::ResidualEchoEstimator(const EchoCanceller3Config& config,
                                             size_t num_render_channels)
    : config_(config),
      num_render_channels_(num_render_channels),
      // Early and late reflections share the same transparent-mode gain.
      early_reflections_transparent_mode_gain_(GetTransparentModeGain()),
      late_reflections_transparent_mode_gain_(GetTransparentModeGain()),
      // NOTE(review): the initializers below read the member `config_`, so
      // `config_` is presumably declared before them in the class - confirm
      // against the header.
      early_reflections_general_gain_(
          GetEarlyReflectionsDefaultModeGain(config_.ep_strength)),
      late_reflections_general_gain_(
          GetLateReflectionsDefaultModeGain(config_.ep_strength)),
      erle_onset_compensation_in_dominant_nearend_(
          UseErleOnsetCompensationInDominantNearend(config_.ep_strength)) {
  Reset();
}

// The compiler-generated destructor is used.
ResidualEchoEstimator::~ResidualEchoEstimator() = default;

void ResidualEchoEstimator::Estimate(
    const AecState& aec_state,
    const RenderBuffer& render_buffer,
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> S2_linear,
    rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> Y2,
    bool dominant_nearend,
    rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2,
    rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2_unbounded) {
  RTC_DCHECK_EQ(R2.size(), Y2.size());
  RTC_DCHECK_EQ(R2.size(), S2_linear.size());

  const size_t num_capture_channels = R2.size();

  // Estimate the power of the stationary noise in the render signal.
  UpdateRenderNoisePower(render_buffer);

  // Estimate the residual echo power.
  if (aec_state.UsableLinearEstimate()) {
    // When there is saturated echo, assume the same spectral content as is
    // present in the microphone signal.
    if (aec_state.SaturatedEcho()) {
      for (size_t ch = 0; ch < num_capture_channels; ++ch) {
        std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
        std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
      }
    } else {
      const bool onset_compensated =
          erle_onset_compensation_in_dominant_nearend_ || !dominant_nearend;
      LinearEstimate(S2_linear, aec_state.Erle(onset_compensated), R2);
      LinearEstimate(S2_linear, aec_state.ErleUnbounded(), R2_unbounded);
    }

    UpdateReverb(ReverbType::kLinear, aec_state, render_buffer);
    AddReverb(R2);
    AddReverb(R2_unbounded);
  } else {
    const float echo_path_gain =
        GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/true);

    // When there is saturated echo, assume the same spectral content as is
    // present in the microphone signal.
    if (aec_state.SaturatedEcho()) {
      for (size_t ch = 0; ch < num_capture_channels; ++ch) {
        std::copy(Y2[ch].begin(), Y2[ch].end(), R2[ch].begin());
        std::copy(Y2[ch].begin(), Y2[ch].end(), R2_unbounded[ch].begin());
      }
    } else {
      // Estimate the echo generating signal power.
      std::array<float, kFftLengthBy2Plus1> X2;
      EchoGeneratingPower(num_render_channels_,
                          render_buffer.GetSpectrumBuffer(), config_.echo_model,
                          aec_state.MinDirectPathFilterDelay(), X2);
      if (!aec_state.UseStationarityProperties()) {
        ApplyNoiseGate(config_.echo_model, X2);
      }

      // Subtract the stationary noise power to avoid stationary noise causing
      // excessive echo suppression.
      for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
        X2[k] -= config_.echo_model.stationary_gate_slope * X2_noise_floor_[k];
        X2[k] = std::max(0.f, X2[k]);
      }

      NonLinearEstimate(echo_path_gain, X2, R2);
      NonLinearEstimate(echo_path_gain, X2, R2_unbounded);
    }

    if (config_.echo_model.model_reverb_in_nonlinear_mode &&
        !aec_state.TransparentModeActive()) {
      UpdateReverb(ReverbType::kNonLinear, aec_state, render_buffer);
      AddReverb(R2);
      AddReverb(R2_unbounded);
    }
  }

  if (aec_state.UseStationarityProperties()) {
    // Scale the echo according to echo audibility.
    std::array<float, kFftLengthBy2Plus1> residual_scaling;
    aec_state.GetResidualEchoScaling(residual_scaling);
    for (size_t ch = 0; ch < num_capture_channels; ++ch) {
      for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
        R2[ch][k] *= residual_scaling[k];
        R2_unbounded[ch][k] *= residual_scaling[k];
      }
    }
  }
}

// Restores the initial state: the render noise floor is set to the configured
// minimum power, the per-bin hold counters are set to the configured hold
// value, and the reverb model is reset.
void ResidualEchoEstimator::Reset() {
  const auto& echo_model = config_.echo_model;
  X2_noise_floor_.fill(echo_model.min_noise_floor_power);
  X2_noise_floor_counter_.fill(echo_model.noise_floor_hold);
  echo_reverb_.Reset();
}

void ResidualEchoEstimator::UpdateRenderNoisePower(
    const RenderBuffer& render_buffer) {
  std::array<float, kFftLengthBy2Plus1> render_power_data;
  rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> X2 =
      render_buffer.Spectrum(0);
  rtc::ArrayView<const float, kFftLengthBy2Plus1> render_power =
      X2[/*channel=*/0];
  if (num_render_channels_ > 1) {
    render_power_data.fill(0.f);
    for (size_t ch = 0; ch < num_render_channels_; ++ch) {
      const auto& channel_power = X2[ch];
      for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
        render_power_data[k] += channel_power[k];
      }
    }
    render_power = render_power_data;
  }

  // Estimate the stationary noise power in a minimum statistics manner.
  for (size_t k = 0; k < kFftLengthBy2Plus1; ++k) {
    // Decrease rapidly.
    if (render_power[k] < X2_noise_floor_[k]) {
      X2_noise_floor_[k] = render_power[k];
      X2_noise_floor_counter_[k] = 0;
    } else {
      // Increase in a delayed, leaky manner.
      if (X2_noise_floor_counter_[k] >=
          static_cast<int>(config_.echo_model.noise_floor_hold)) {
        X2_noise_floor_[k] = std::max(X2_noise_floor_[k] * 1.1f,
                                      config_.echo_model.min_noise_floor_power);
      } else {
        ++X2_noise_floor_counter_[k];
      }
    }
  }
}

// Updates the reverb model with the render power found right after the part of
// the echo path covered by the current echo power model. In linear mode that
// is the block after the filter length; in nonlinear mode it is the block
// after the minimum direct path filter delay.
void ResidualEchoEstimator::UpdateReverb(ReverbType reverb_type,
                                         const AecState& aec_state,
                                         const RenderBuffer& render_buffer) {
  const bool is_linear = reverb_type == ReverbType::kLinear;

  // First partition that contributes to the reverb.
  const size_t first_reverb_partition =
      is_linear ? aec_state.FilterLengthBlocks() + 1
                : aec_state.MinDirectPathFilterDelay() + 1;

  // Render power for the reverb, summed over the channels when there are
  // several of them.
  rtc::ArrayView<const std::array<float, kFftLengthBy2Plus1>> channel_spectra =
      render_buffer.Spectrum(first_reverb_partition);
  std::array<float, kFftLengthBy2Plus1> summed_power;
  rtc::ArrayView<const float, kFftLengthBy2Plus1> render_power =
      channel_spectra[/*channel=*/0];
  if (num_render_channels_ > 1) {
    summed_power.fill(0.f);
    for (size_t ch = 0; ch < num_render_channels_; ++ch) {
      const auto& single_channel_power = channel_spectra[ch];
      for (size_t bin = 0; bin < kFftLengthBy2Plus1; ++bin) {
        summed_power[bin] += single_channel_power[bin];
      }
    }
    render_power = summed_power;
  }

  // Feed the reverb model.
  if (is_linear) {
    echo_reverb_.UpdateReverb(render_power,
                              aec_state.GetReverbFrequencyResponse(),
                              aec_state.ReverbDecay());
    return;
  }

  // Nonlinear mode: a single gain for the late reflections is used instead of
  // a frequency response.
  const float late_reflections_gain =
      GetEchoPathGain(aec_state, /*gain_for_early_reflections=*/false);
  echo_reverb_.UpdateReverbNoFreqShaping(render_power, late_reflections_gain,
                                         aec_state.ReverbDecay());
}
// Adds the current reverb power estimate to the residual echo power of every
// capture channel in `R2`.
void ResidualEchoEstimator::AddReverb(
    rtc::ArrayView<std::array<float, kFftLengthBy2Plus1>> R2) const {
  rtc::ArrayView<const float, kFftLengthBy2Plus1> reverb_power =
      echo_reverb_.reverb();

  for (auto& channel_residual : R2) {
    for (size_t bin = 0; bin < kFftLengthBy2Plus1; ++bin) {
      channel_residual[bin] += reverb_power[bin];
    }
  }
}

// Selects the echo path gain for either the early or the late reflections,
// depending on whether transparent mode is active, and returns it as a power
// gain (the selected amplitude gain squared).
float ResidualEchoEstimator::GetEchoPathGain(
    const AecState& aec_state,
    bool gain_for_early_reflections) const {
  const bool transparent_mode = aec_state.TransparentModeActive();

  float amplitude;
  if (gain_for_early_reflections) {
    amplitude = transparent_mode ? early_reflections_transparent_mode_gain_
                                 : early_reflections_general_gain_;
  } else {
    amplitude = transparent_mode ? late_reflections_transparent_mode_gain_
                                 : late_reflections_general_gain_;
  }
  return amplitude * amplitude;
}

}  // namespace webrtc