mirror of
https://github.com/mollyim/webrtc.git
synced 2025-05-15 23:01:21 +01:00

This CL defines SpectralFeaturesExtractor, which is responsible for computing the spectral features used as input for the RNN.

Bug: webrtc:9076
Change-Id: I5e1396b89eca9c13bb268e8419a16436a9c3450f
Reviewed-on: https://webrtc-review.googlesource.com/73760
Commit-Queue: Alessio Bazzica <alessiob@webrtc.org>
Reviewed-by: Alex Loiko <aleloi@webrtc.org>
Reviewed-by: Ivo Creusen <ivoc@webrtc.org>
Cr-Commit-Position: refs/heads/master@{#23206}
128 lines
4.8 KiB
C++
128 lines
4.8 KiB
C++
/*
|
|
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
|
|
#include "rtc_base/checks.h"
|
|
|
|
namespace webrtc {
|
|
namespace rnn_vad {
|
|
namespace {
|
|
|
|
// DCT scaling factor.
|
|
const float kDctScalingFactor = std::sqrt(2.f / kNumBands);
|
|
|
|
} // namespace
|
|
|
|
std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
|
|
size_t sample_rate_hz,
|
|
size_t frame_size_samples) {
|
|
std::array<size_t, kNumBands> indexes;
|
|
for (size_t i = 0; i < kNumBands; ++i) {
|
|
indexes[i] =
|
|
kBandFrequencyBoundaries[i] * frame_size_samples / sample_rate_hz;
|
|
}
|
|
return indexes;
|
|
}
|
|
|
|
// Computes one coefficient per band and writes it into |coefficients|.
// |functor| is called once for every visited frequency bin index and its
// return value is split between the two bands adjacent to that bin.
// |band_boundaries| holds the first frequency bin of each band and
// |max_freq_bin_index| is the highest bin index |functor| is called with.
void ComputeBandCoefficients(
    rtc::FunctionView<float(size_t)> functor,
    rtc::ArrayView<const size_t, kNumBands> band_boundaries,
    size_t max_freq_bin_index,
    rtc::ArrayView<float, kNumBands> coefficients) {
  // Start from an all-zero output since the loop below only accumulates.
  for (float& c : coefficients) {
    c = 0.f;
  }
  const size_t num_band_pairs = coefficients.size() - 1;
  for (size_t band = 0; band < num_band_pairs; ++band) {
    const size_t next_band = band + 1;
    RTC_DCHECK_EQ(0.f, coefficients[next_band]);
    RTC_DCHECK_GT(band_boundaries[next_band], band_boundaries[band]);
    const size_t bin_begin = band_boundaries[band];
    // Last bin of the band (one before the next boundary), clamped to the
    // highest available bin.
    const size_t nominal_band_size =
        band_boundaries[next_band] - band_boundaries[band];
    const size_t bin_last =
        std::min(max_freq_bin_index, bin_begin + nominal_band_size - 1);
    // Depending on the sample rate, the highest bands can have no FFT
    // coefficients. Stop the iteration when coming across the first empty band.
    if (bin_last <= bin_begin) {
      break;
    }
    const size_t band_size = bin_last - bin_begin + 1;
    // Triangular weighting with peak response at the band boundary: the
    // weight given to |band| decreases linearly across the bins while the
    // weight given to |next_band| increases by the same amount.
    for (size_t bin = bin_begin; bin <= bin_last; ++bin) {
      const float value = functor(bin);
      const float rising = static_cast<float>(bin - bin_begin) / band_size;
      coefficients[band] += (1.f - rising) * value;
      coefficients[next_band] += rising * value;
    }
  }
  // The first and the last bands only received half of the contribution in
  // the loop above, hence they are doubled.
  coefficients[0] *= 2.f;
  coefficients[coefficients.size() - 1] *= 2.f;
  // TODO(bugs.webrtc.org/9076): Replace the line above with
  // "coefficients[i] *= 2.f" (*) since we now assume that the last band is
  // always |kNumBands| - 1.
  // (*): "size_t i" must be declared before the main loop.
}
|
|
|
|
// Computes the band energies from the FFT coefficients |fft_coeffs| and writes
// them into |band_energies|. The value used for each frequency bin is the
// squared magnitude of its FFT coefficient (std::norm); the per-band
// accumulation is delegated to ComputeBandCoefficients() with
// |band_boundaries| and the index of the last FFT coefficient as upper limit.
// |fft_coeffs| must not be empty.
void ComputeBandEnergies(
    rtc::ArrayView<const std::complex<float>> fft_coeffs,
    rtc::ArrayView<const size_t, kNumBands> band_boundaries,
    rtc::ArrayView<float, kNumBands> band_energies) {
  // |fft_coeffs.size() - 1| is computed below with unsigned arithmetic; for an
  // empty view it would wrap around and the bin index would no longer be
  // clamped to a valid range.
  RTC_DCHECK_LE(1, fft_coeffs.size());
  RTC_DCHECK_EQ(band_boundaries.size(), band_energies.size());
  const size_t max_freq_bin_index = fft_coeffs.size() - 1;
  auto functor = [fft_coeffs](const size_t freq_bin_index) -> float {
    return std::norm(fft_coeffs[freq_bin_index]);
  };
  ComputeBandCoefficients(functor, band_boundaries, max_freq_bin_index,
                          band_energies);
}
|
|
|
|
// Writes into |log_band_energy_coeffs| the base-10 logarithm of
// |band_energy_coeffs| (offset by 1e-2) after smoothing across bands. Each
// output value is bounded from below by the largest output so far minus 7 and
// by a follower value that loses 1.5 at every band.
void ComputeLogBandEnergiesCoefficients(
    rtc::ArrayView<const float, kNumBands> band_energy_coeffs,
    rtc::ArrayView<float, kNumBands> log_band_energy_coeffs) {
  float running_max = -2.f;
  float follower = -2.f;
  const size_t num_bands = band_energy_coeffs.size();
  for (size_t band = 0; band < num_bands; ++band) {
    const float raw_log = std::log10(1e-2f + band_energy_coeffs[band]);
    // Smoothing across frequency bands.
    const float follower_floor = follower - 1.5f;
    const float smoothed =
        std::max(running_max - 7.f, std::max(follower_floor, raw_log));
    log_band_energy_coeffs[band] = smoothed;
    // Update the trackers with the smoothed value for the next band.
    running_max = std::max(running_max, smoothed);
    follower = std::max(follower_floor, smoothed);
  }
}
|
|
|
|
std::array<float, kNumBands * kNumBands> ComputeDctTable() {
|
|
std::array<float, kNumBands * kNumBands> dct_table;
|
|
const double k = std::sqrt(0.5);
|
|
for (size_t i = 0; i < kNumBands; ++i) {
|
|
for (size_t j = 0; j < kNumBands; ++j)
|
|
dct_table[i * kNumBands + j] = std::cos((i + 0.5) * j * kPi / kNumBands);
|
|
dct_table[i * kNumBands] *= k;
|
|
}
|
|
return dct_table;
|
|
}
|
|
|
|
// Computes the first |out.size()| DCT coefficients of |in| using the
// pre-computed |dct_table| and scales each of them by |kDctScalingFactor|.
// |out| must hold between 1 and |in.size()| values and must not share its
// buffer with |in|.
void ComputeDct(rtc::ArrayView<const float, kNumBands> in,
                rtc::ArrayView<const float, kNumBands * kNumBands> dct_table,
                rtc::ArrayView<float> out) {
  RTC_DCHECK_NE(in.data(), out.data()) << "In-place DCT is not supported.";
  RTC_DCHECK_LE(1, out.size());
  RTC_DCHECK_LE(out.size(), in.size());
  const size_t num_inputs = in.size();
  const size_t num_outputs = out.size();
  std::fill(out.begin(), out.end(), 0.f);
  for (size_t k = 0; k < num_outputs; ++k) {
    float& coefficient = out[k];
    // Walk down column |k| of the table: one entry per input value.
    for (size_t n = 0; n < num_inputs; ++n) {
      coefficient += in[n] * dct_table[n * num_inputs + k];
    }
    coefficient *= kDctScalingFactor;
  }
}
|
|
|
|
} // namespace rnn_vad
|
|
} // namespace webrtc
|