mirror of
https://github.com/mollyim/webrtc.git
synced 2025-05-18 08:07:56 +01:00
AGC2 RNN VAD: Spectral features extraction.
This CL defines SpectralFeaturesExtractor which is responsible for computing the spectral features used as input for the RNN. Bug: webrtc:9076 Change-Id: I5e1396b89eca9c13bb268e8419a16436a9c3450f Reviewed-on: https://webrtc-review.googlesource.com/73760 Commit-Queue: Alessio Bazzica <alessiob@webrtc.org> Reviewed-by: Alex Loiko <aleloi@webrtc.org> Reviewed-by: Ivo Creusen <ivoc@webrtc.org> Cr-Commit-Position: refs/heads/master@{#23206}
This commit is contained in:
parent
739351d476
commit
bc0b37c08a
9 changed files with 473 additions and 20 deletions
|
@ -30,6 +30,8 @@ source_set("lib") {
|
||||||
"rnn.cc",
|
"rnn.cc",
|
||||||
"rnn.h",
|
"rnn.h",
|
||||||
"sequence_buffer.h",
|
"sequence_buffer.h",
|
||||||
|
"spectral_features.cc",
|
||||||
|
"spectral_features.h",
|
||||||
"spectral_features_internal.cc",
|
"spectral_features_internal.cc",
|
||||||
"spectral_features_internal.h",
|
"spectral_features_internal.h",
|
||||||
"symmetric_matrix_buffer.h",
|
"symmetric_matrix_buffer.h",
|
||||||
|
@ -90,6 +92,7 @@ if (rtc_include_tests) {
|
||||||
"rnn_unittest.cc",
|
"rnn_unittest.cc",
|
||||||
"sequence_buffer_unittest.cc",
|
"sequence_buffer_unittest.cc",
|
||||||
"spectral_features_internal_unittest.cc",
|
"spectral_features_internal_unittest.cc",
|
||||||
|
"spectral_features_unittest.cc",
|
||||||
"symmetric_matrix_buffer_unittest.cc",
|
"symmetric_matrix_buffer_unittest.cc",
|
||||||
]
|
]
|
||||||
deps = [
|
deps = [
|
||||||
|
|
|
@ -51,6 +51,14 @@ constexpr int kBandFrequencyBoundaries[kNumBands] = {
|
||||||
0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400,
|
0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 2000, 2400,
|
||||||
2800, 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
|
2800, 3200, 4000, 4800, 5600, 6800, 8000, 9600, 12000, 15600, 20000};
|
||||||
|
|
||||||
|
// Feature extraction parameters.
|
||||||
|
constexpr size_t kNumLowerBands = 6;
|
||||||
|
static_assert((0 < kNumLowerBands) && (kNumLowerBands < kNumBands), "");
|
||||||
|
constexpr size_t kSpectralCoeffsHistorySize = 8;
|
||||||
|
static_assert(kSpectralCoeffsHistorySize > 2,
|
||||||
|
"The history size must at least be 3 to compute first and second "
|
||||||
|
"derivatives.");
|
||||||
|
|
||||||
constexpr size_t kFeatureVectorSize = 42;
|
constexpr size_t kFeatureVectorSize = 42;
|
||||||
|
|
||||||
} // namespace rnn_vad
|
} // namespace rnn_vad
|
||||||
|
|
|
@ -14,6 +14,10 @@
|
||||||
namespace webrtc {
|
namespace webrtc {
|
||||||
namespace rnn_vad {
|
namespace rnn_vad {
|
||||||
|
|
||||||
|
// TODO(bugs.webrtc.org/9076): To decrease the stack size, add a class that uses
|
||||||
|
// std::vector instances instead of the local arrays used in PitchSearch(). It
|
||||||
|
// is also useful once https://webrtc-review.googlesource.com/c/src/+/73366
|
||||||
|
// lands.
|
||||||
PitchInfo PitchSearch(rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
|
PitchInfo PitchSearch(rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
|
||||||
PitchInfo prev_pitch_48kHz,
|
PitchInfo prev_pitch_48kHz,
|
||||||
RealFourier* fft) {
|
RealFourier* fft) {
|
||||||
|
|
|
@ -72,6 +72,7 @@ class SequenceBuffer {
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
// TODO(bugs.webrtc.org/9076): Switch to std::vector to decrease stack size.
|
||||||
std::array<T, S> buffer_;
|
std::array<T, S> buffer_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
210
modules/audio_processing/agc2/rnn_vad/spectral_features.cc
Normal file
210
modules/audio_processing/agc2/rnn_vad/spectral_features.cc
Normal file
|
@ -0,0 +1,210 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/spectral_features.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <cmath>
|
||||||
|
#include <limits>
|
||||||
|
#include <numeric>
|
||||||
|
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/spectral_features_internal.h"
|
||||||
|
#include "rtc_base/checks.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
namespace rnn_vad {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
constexpr float kSilenceThreshold = 0.04f;
|
||||||
|
|
||||||
|
// Computes the new spectral difference stats and pushes them into the passed
|
||||||
|
// symmetric matrix buffer.
|
||||||
|
void UpdateSpectralDifferenceStats(
|
||||||
|
rtc::ArrayView<const float, kNumBands> new_spectral_coeffs,
|
||||||
|
const RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>& ring_buf,
|
||||||
|
SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize>* sym_matrix_buf) {
|
||||||
|
RTC_DCHECK(sym_matrix_buf);
|
||||||
|
// Compute the new spectral distance stats.
|
||||||
|
std::array<float, kSpectralCoeffsHistorySize - 1> distances;
|
||||||
|
for (size_t i = 0; i < kSpectralCoeffsHistorySize - 1; ++i) {
|
||||||
|
const size_t delay = i + 1;
|
||||||
|
auto old_spectral_coeffs = ring_buf.GetArrayView(delay);
|
||||||
|
distances[i] = 0.f;
|
||||||
|
for (size_t k = 0; k < kNumBands; ++k) {
|
||||||
|
const float c = new_spectral_coeffs[k] - old_spectral_coeffs[k];
|
||||||
|
distances[i] += c * c;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Push the new spectral distance stats into the symmetric matrix buffer.
|
||||||
|
sym_matrix_buf->Push({distances.data(), distances.size()});
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Stores the given views and the |variability| pointer as they are; the
// constructor itself does not read or write the memory they refer to.
SpectralFeaturesView::SpectralFeaturesView(
    rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
    rtc::ArrayView<float, kNumLowerBands> average,
    rtc::ArrayView<float, kNumLowerBands> first_derivative,
    rtc::ArrayView<float, kNumLowerBands> second_derivative,
    rtc::ArrayView<float, kNumLowerBands> cross_correlations,
    float* variability)
    : coeffs(coeffs),
      average(average),
      first_derivative(first_derivative),
      second_derivative(second_derivative),
      cross_correlations(cross_correlations),
      variability(variability) {}

// Member-wise copy and trivial destruction, defined out-of-line.
SpectralFeaturesView::SpectralFeaturesView(const SpectralFeaturesView&) =
    default;

SpectralFeaturesView::~SpectralFeaturesView() = default;
|
||||||
|
|
||||||
|
// Allocates the two FFT output buffers with kFrameSize20ms24kHz elements each
// and precomputes the constant tables: the band boundary indexes for 24 kHz,
// 20 ms frames and the DCT table.
SpectralFeaturesExtractor::SpectralFeaturesExtractor()
    : fft_(),
      reference_frame_fft_(kFrameSize20ms24kHz),
      lagged_frame_fft_(kFrameSize20ms24kHz),
      band_boundaries_(
          ComputeBandBoundaryIndexes(kSampleRate24kHz, kFrameSize20ms24kHz)),
      dct_table_(ComputeDctTable()) {}

SpectralFeaturesExtractor::~SpectralFeaturesExtractor() = default;
|
||||||
|
|
||||||
|
// Clears the history kept across frames: the buffer of spectral differences
// and the ring buffer of spectral coefficients.
void SpectralFeaturesExtractor::Reset() {
  spectral_diffs_buf_.Reset();
  spectral_coeffs_ring_buf_.Reset();
}
|
||||||
|
|
||||||
|
// Returns true, leaving |spectral_features| and the feature history untouched,
// when the total band energy of |reference_frame| is below kSilenceThreshold.
// Otherwise computes every spectral feature, writes them through
// |spectral_features| and returns false.
bool SpectralFeaturesExtractor::CheckSilenceComputeFeatures(
    rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
    rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
    SpectralFeaturesView spectral_features) {
  // Step 1: band energies of the reference frame.
  fft_.ForwardFft(reference_frame, reference_frame_fft_);
  ComputeBandEnergies(reference_frame_fft_,
                      {band_boundaries_.data(), band_boundaries_.size()},
                      {reference_frame_energy_coeffs_.data(),
                       reference_frame_energy_coeffs_.size()});

  // Step 2: silence detection on the total energy; bail out early on silence.
  float total_energy = 0.f;
  for (const float band_energy : reference_frame_energy_coeffs_) {
    total_energy += band_energy;
  }
  if (total_energy < kSilenceThreshold) {
    return true;
  }

  // Step 3: band energies of the lagged frame (used by the cross-correlation).
  fft_.ForwardFft(lagged_frame, lagged_frame_fft_);
  ComputeBandEnergies(
      lagged_frame_fft_, {band_boundaries_.data(), band_boundaries_.size()},
      {lagged_frame_energy_coeffs_.data(), lagged_frame_energy_coeffs_.size()});

  // Step 4: log band energies of the reference frame, decorrelated via DCT.
  std::array<float, kNumBands> log_energies;
  ComputeLogBandEnergiesCoefficients(
      {reference_frame_energy_coeffs_.data(),
       reference_frame_energy_coeffs_.size()},
      {log_energies.data(), log_energies.size()});
  std::array<float, kNumBands> dct_coeffs;
  ComputeDct({log_energies.data(), log_energies.size()},
             {dct_table_.data(), dct_table_.size()},
             {dct_coeffs.data(), dct_coeffs.size()});
  // Offsets derived from training set stats.
  dct_coeffs[0] -= 12.f;
  dct_coeffs[1] -= 4.f;

  // Step 5: update the history. The new coefficients are pushed first, then
  // the spectral difference stats are refreshed.
  spectral_coeffs_ring_buf_.Push({dct_coeffs.data(), dct_coeffs.size()});
  UpdateSpectralDifferenceStats({dct_coeffs.data(), dct_coeffs.size()},
                                spectral_coeffs_ring_buf_,
                                &spectral_diffs_buf_);

  // Step 6: output. The higher bands coefficients are copied from the newest
  // ring buffer entry, skipping the first kNumLowerBands items.
  auto newest_coeffs = spectral_coeffs_ring_buf_.GetArrayView(0);
  RTC_DCHECK_EQ(newest_coeffs.size() - kNumLowerBands,
                spectral_features.coeffs.size());
  std::copy(newest_coeffs.begin() + kNumLowerBands, newest_coeffs.end(),
            spectral_features.coeffs.begin());
  // Lower bands statistics, cross-correlation and variability score.
  ComputeAvgAndDerivatives(spectral_features.average,
                           spectral_features.first_derivative,
                           spectral_features.second_derivative);
  ComputeCrossCorrelation(spectral_features.cross_correlations);
  RTC_DCHECK(spectral_features.variability);
  *(spectral_features.variability) = ComputeVariability();
  return false;
}
|
||||||
|
|
||||||
|
// Combines, band by band, the three most recent entries of the spectral
// coefficients ring buffer (delays 0, 1 and 2) into the three output views.
// Only the first |average.size()| bands are processed.
void SpectralFeaturesExtractor::ComputeAvgAndDerivatives(
    rtc::ArrayView<float, kNumLowerBands> average,
    rtc::ArrayView<float, kNumLowerBands> first_derivative,
    rtc::ArrayView<float, kNumLowerBands> second_derivative) {
  auto newest = spectral_coeffs_ring_buf_.GetArrayView(0);
  auto middle = spectral_coeffs_ring_buf_.GetArrayView(1);
  auto oldest = spectral_coeffs_ring_buf_.GetArrayView(2);
  RTC_DCHECK_EQ(average.size(), first_derivative.size());
  RTC_DCHECK_EQ(first_derivative.size(), second_derivative.size());
  RTC_DCHECK_LE(average.size(), newest.size());
  const size_t num_bands = average.size();
  for (size_t band = 0; band < num_bands; ++band) {
    const float x0 = newest[band];
    const float x1 = middle[band];
    const float x2 = oldest[band];
    // Kernel [1, 1, 1]: note that the sum is not divided by 3.
    average[band] = x0 + x1 + x2;
    // Kernel [1, 0, -1].
    first_derivative[band] = x0 - x2;
    // Laplacian kernel [1, -2, 1].
    second_derivative[band] = x0 - 2 * x1 + x2;
  }
}
|
||||||
|
|
||||||
|
// Computes the band-wise cross-correlation between the FFT coefficients of the
// reference and the lagged frames, normalizes it by the band energies of the
// two frames, decorrelates it via DCT and writes the result into
// |cross_correlations|. Must be called after both FFTs and both band energy
// arrays have been updated for the current frame pair.
void SpectralFeaturesExtractor::ComputeCrossCorrelation(
    rtc::ArrayView<float, kNumLowerBands> cross_correlations) {
  const auto& x = reference_frame_fft_;
  const auto& y = lagged_frame_fft_;
  // Capture by reference: |x| and |y| name std::vector members, hence a
  // by-value capture would deep-copy both FFT buffers on every call. The
  // lambda is only invoked synchronously by ComputeBandCoefficients() below,
  // so the references cannot dangle.
  auto cross_corr = [&x, &y](const size_t freq_bin_index) -> float {
    return (x[freq_bin_index].real() * y[freq_bin_index].real() +
            x[freq_bin_index].imag() * y[freq_bin_index].imag());
  };
  std::array<float, kNumBands> cross_corr_coeffs;
  constexpr size_t kNumFftPoints = kFrameSize20ms24kHz / 2 + 1;
  ComputeBandCoefficients(
      cross_corr, {band_boundaries_.data(), band_boundaries_.size()},
      kNumFftPoints - 1, {cross_corr_coeffs.data(), cross_corr_coeffs.size()});
  // Normalize by the geometric mean of the band energies; the 0.001 offset
  // keeps the denominator strictly positive.
  for (size_t i = 0; i < cross_corr_coeffs.size(); ++i) {
    cross_corr_coeffs[i] =
        cross_corr_coeffs[i] /
        std::sqrt(0.001f + reference_frame_energy_coeffs_[i] *
                               lagged_frame_energy_coeffs_[i]);
  }
  // Decorrelate.
  ComputeDct({cross_corr_coeffs.data(), cross_corr_coeffs.size()},
             {dct_table_.data(), dct_table_.size()},
             {cross_correlations.data(), cross_correlations.size()});
  // Normalize (based on training set stats).
  cross_correlations[0] -= 1.3f;
  cross_correlations[1] -= 0.9f;
}
|
||||||
|
|
||||||
|
float SpectralFeaturesExtractor::ComputeVariability() {
|
||||||
|
// Compute spectral variability score.
|
||||||
|
float spec_variability = 0.f;
|
||||||
|
for (size_t delay1 = 0; delay1 < kSpectralCoeffsHistorySize; ++delay1) {
|
||||||
|
float min_dist = std::numeric_limits<float>::max();
|
||||||
|
for (size_t delay2 = 0; delay2 < kSpectralCoeffsHistorySize; ++delay2) {
|
||||||
|
if (delay1 == delay2) // The distance would be 0.
|
||||||
|
continue;
|
||||||
|
min_dist =
|
||||||
|
std::min(min_dist, spectral_diffs_buf_.GetValue(delay1, delay2));
|
||||||
|
}
|
||||||
|
spec_variability += min_dist;
|
||||||
|
}
|
||||||
|
// Normalize (based on training set stats).
|
||||||
|
return spec_variability / kSpectralCoeffsHistorySize - 2.1f;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace rnn_vad
|
||||||
|
} // namespace webrtc
|
92
modules/audio_processing/agc2/rnn_vad/spectral_features.h
Normal file
92
modules/audio_processing/agc2/rnn_vad/spectral_features.h
Normal file
|
@ -0,0 +1,92 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_H_
|
||||||
|
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_H_
|
||||||
|
|
||||||
|
#include <array>
|
||||||
|
#include <complex>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "api/array_view.h"
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/fft_util.h"
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/ring_buffer.h"
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/symmetric_matrix_buffer.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
namespace rnn_vad {
|
||||||
|
|
||||||
|
// View on spectral features.
|
||||||
|
class SpectralFeaturesView {
|
||||||
|
public:
|
||||||
|
SpectralFeaturesView(rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs,
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> average,
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> second_derivative,
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> cross_correlations,
|
||||||
|
float* variability);
|
||||||
|
SpectralFeaturesView(const SpectralFeaturesView&);
|
||||||
|
~SpectralFeaturesView();
|
||||||
|
// Higher bands spectral coefficients.
|
||||||
|
const rtc::ArrayView<float, kNumBands - kNumLowerBands> coeffs;
|
||||||
|
// Average and first and second derivative over time for the lower bands.
|
||||||
|
const rtc::ArrayView<float, kNumLowerBands> average;
|
||||||
|
const rtc::ArrayView<float, kNumLowerBands> first_derivative;
|
||||||
|
const rtc::ArrayView<float, kNumLowerBands> second_derivative;
|
||||||
|
// Spectral cross-correlation for the lower bands.
|
||||||
|
const rtc::ArrayView<float, kNumLowerBands> cross_correlations;
|
||||||
|
// Spectral variability score.
|
||||||
|
float* const variability;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Class to compute spectral features.
|
||||||
|
class SpectralFeaturesExtractor {
|
||||||
|
public:
|
||||||
|
SpectralFeaturesExtractor();
|
||||||
|
SpectralFeaturesExtractor(const SpectralFeaturesExtractor&) = delete;
|
||||||
|
SpectralFeaturesExtractor& operator=(const SpectralFeaturesExtractor&) =
|
||||||
|
delete;
|
||||||
|
~SpectralFeaturesExtractor();
|
||||||
|
// Resets the internal state of the feature extractor.
|
||||||
|
void Reset();
|
||||||
|
// Analyzes a pair of reference and lagged frames from the pitch buffer,
|
||||||
|
// detects silence and computes features. If silence is detected, the output
|
||||||
|
// is neither computed nor written.
|
||||||
|
bool CheckSilenceComputeFeatures(
|
||||||
|
rtc::ArrayView<const float, kFrameSize20ms24kHz> reference_frame,
|
||||||
|
rtc::ArrayView<const float, kFrameSize20ms24kHz> lagged_frame,
|
||||||
|
SpectralFeaturesView spectral_features);
|
||||||
|
|
||||||
|
private:
|
||||||
|
void ComputeAvgAndDerivatives(
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> average,
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> first_derivative,
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> second_derivative);
|
||||||
|
void ComputeCrossCorrelation(
|
||||||
|
rtc::ArrayView<float, kNumLowerBands> cross_correlations);
|
||||||
|
float ComputeVariability();
|
||||||
|
|
||||||
|
BandAnalysisFft fft_;
|
||||||
|
std::vector<std::complex<float>> reference_frame_fft_;
|
||||||
|
std::vector<std::complex<float>> lagged_frame_fft_;
|
||||||
|
std::array<float, kNumBands> reference_frame_energy_coeffs_{};
|
||||||
|
std::array<float, kNumBands> lagged_frame_energy_coeffs_{};
|
||||||
|
const std::array<size_t, kNumBands> band_boundaries_;
|
||||||
|
const std::array<float, kNumBands * kNumBands> dct_table_;
|
||||||
|
RingBuffer<float, kNumBands, kSpectralCoeffsHistorySize>
|
||||||
|
spectral_coeffs_ring_buf_;
|
||||||
|
SymmetricMatrixBuffer<float, kSpectralCoeffsHistorySize> spectral_diffs_buf_;
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace rnn_vad
|
||||||
|
} // namespace webrtc
|
||||||
|
|
||||||
|
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_SPECTRAL_FEATURES_H_
|
|
@ -14,7 +14,6 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
|
||||||
#include "rtc_base/checks.h"
|
#include "rtc_base/checks.h"
|
||||||
#include "rtc_base/function_view.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
namespace webrtc {
|
||||||
namespace rnn_vad {
|
namespace rnn_vad {
|
||||||
|
@ -23,11 +22,19 @@ namespace {
|
||||||
// DCT scaling factor.
|
// DCT scaling factor.
|
||||||
const float kDctScalingFactor = std::sqrt(2.f / kNumBands);
|
const float kDctScalingFactor = std::sqrt(2.f / kNumBands);
|
||||||
|
|
||||||
// Iterates through frequency bands and computes coefficients via |functor| for
|
} // namespace
|
||||||
// triangular bands with peak response at each band boundary. |functor| returns
|
|
||||||
// a floating point value for the FFT coefficient having index equal to the
|
std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
|
||||||
// argument passed to |functor|; that argument is in the range {0, ...
|
size_t sample_rate_hz,
|
||||||
// |max_freq_bin_index| - 1}.
|
size_t frame_size_samples) {
|
||||||
|
std::array<size_t, kNumBands> indexes;
|
||||||
|
for (size_t i = 0; i < kNumBands; ++i) {
|
||||||
|
indexes[i] =
|
||||||
|
kBandFrequencyBoundaries[i] * frame_size_samples / sample_rate_hz;
|
||||||
|
}
|
||||||
|
return indexes;
|
||||||
|
}
|
||||||
|
|
||||||
void ComputeBandCoefficients(
|
void ComputeBandCoefficients(
|
||||||
rtc::FunctionView<float(size_t)> functor,
|
rtc::FunctionView<float(size_t)> functor,
|
||||||
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
||||||
|
@ -64,25 +71,12 @@ void ComputeBandCoefficients(
|
||||||
// (*): "size_t i" must be declared before the main loop.
|
// (*): "size_t i" must be declared before the main loop.
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
|
|
||||||
size_t sample_rate_hz,
|
|
||||||
size_t frame_size_samples) {
|
|
||||||
std::array<size_t, kNumBands> indexes;
|
|
||||||
for (size_t i = 0; i < kNumBands; ++i) {
|
|
||||||
indexes[i] =
|
|
||||||
kBandFrequencyBoundaries[i] * frame_size_samples / sample_rate_hz;
|
|
||||||
}
|
|
||||||
return indexes;
|
|
||||||
}
|
|
||||||
|
|
||||||
void ComputeBandEnergies(
|
void ComputeBandEnergies(
|
||||||
rtc::ArrayView<const std::complex<float>> fft_coeffs,
|
rtc::ArrayView<const std::complex<float>> fft_coeffs,
|
||||||
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
||||||
rtc::ArrayView<float, kNumBands> band_energies) {
|
rtc::ArrayView<float, kNumBands> band_energies) {
|
||||||
RTC_DCHECK_EQ(band_boundaries.size(), band_energies.size());
|
RTC_DCHECK_EQ(band_boundaries.size(), band_energies.size());
|
||||||
auto functor = [fft_coeffs](const size_t freq_bin_index) {
|
auto functor = [fft_coeffs](const size_t freq_bin_index) -> float {
|
||||||
return std::norm(fft_coeffs[freq_bin_index]);
|
return std::norm(fft_coeffs[freq_bin_index]);
|
||||||
};
|
};
|
||||||
ComputeBandCoefficients(functor, band_boundaries, fft_coeffs.size() - 1,
|
ComputeBandCoefficients(functor, band_boundaries, fft_coeffs.size() - 1,
|
||||||
|
|
|
@ -16,6 +16,7 @@
|
||||||
|
|
||||||
#include "api/array_view.h"
|
#include "api/array_view.h"
|
||||||
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
||||||
|
#include "rtc_base/function_view.h"
|
||||||
|
|
||||||
namespace webrtc {
|
namespace webrtc {
|
||||||
namespace rnn_vad {
|
namespace rnn_vad {
|
||||||
|
@ -25,6 +26,17 @@ std::array<size_t, kNumBands> ComputeBandBoundaryIndexes(
|
||||||
size_t sample_rate_hz,
|
size_t sample_rate_hz,
|
||||||
size_t frame_size_samples);
|
size_t frame_size_samples);
|
||||||
|
|
||||||
|
// Iterates through frequency bands and computes coefficients via |functor| for
|
||||||
|
// triangular bands with peak response at each band boundary. |functor| returns
|
||||||
|
// a floating point value for the FFT coefficient having index equal to the
|
||||||
|
// argument passed to |functor|; that argument is in the range {0, ...
|
||||||
|
// |max_freq_bin_index| - 1}.
|
||||||
|
void ComputeBandCoefficients(
|
||||||
|
rtc::FunctionView<float(size_t)> functor,
|
||||||
|
rtc::ArrayView<const size_t, kNumBands> band_boundaries,
|
||||||
|
const size_t max_freq_bin_index,
|
||||||
|
rtc::ArrayView<float, kNumBands> coefficients);
|
||||||
|
|
||||||
// Given an array of FFT coefficients and a vector of band boundary indexes,
|
// Given an array of FFT coefficients and a vector of band boundary indexes,
|
||||||
// computes band energy coefficients.
|
// computes band energy coefficients.
|
||||||
void ComputeBandEnergies(
|
void ComputeBandEnergies(
|
||||||
|
|
|
@ -0,0 +1,129 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* Use of this source code is governed by a BSD-style license
|
||||||
|
* that can be found in the LICENSE file in the root of the source
|
||||||
|
* tree. An additional intellectual property rights grant can be found
|
||||||
|
* in the file PATENTS. All contributing project authors may
|
||||||
|
* be found in the AUTHORS file in the root of the source tree.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/spectral_features.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
|
#include "modules/audio_processing/agc2/rnn_vad/test_utils.h"
|
||||||
|
#include "rtc_base/checks.h"
|
||||||
|
// TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
|
||||||
|
// #include "test/fpe_observer.h"
|
||||||
|
#include "test/gtest.h"
|
||||||
|
|
||||||
|
namespace webrtc {
|
||||||
|
namespace rnn_vad {
|
||||||
|
namespace test {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
constexpr size_t kTestFeatureVectorSize = kNumBands + 3 * kNumLowerBands + 1;
|
||||||
|
|
||||||
|
// Writes non-zero sample values.
|
||||||
|
void WriteTestData(rtc::ArrayView<float> samples) {
|
||||||
|
for (size_t i = 0; i < samples.size(); ++i) {
|
||||||
|
samples[i] = i % 100;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Maps |feature_vector| onto a SpectralFeaturesView using the following layout:
//   [0, kNumLowerBands)                      average
//   [kNumLowerBands, kNumBands)              higher bands coefficients
//   [kNumBands, +kNumLowerBands)             first derivative
//   [kNumBands + kNumLowerBands, ...)        second derivative
//   [kNumBands + 2 * kNumLowerBands, ...)    cross-correlations
//   kNumBands + 3 * kNumLowerBands           variability (single value)
SpectralFeaturesView GetSpectralFeaturesView(
    std::array<float, kTestFeatureVectorSize>* feature_vector) {
  float* const base = feature_vector->data();
  constexpr size_t kNumHigherBands = kNumBands - kNumLowerBands;
  return {{base + kNumLowerBands, kNumHigherBands},
          {base, kNumLowerBands},
          {base + kNumBands, kNumLowerBands},
          {base + kNumBands + kNumLowerBands, kNumLowerBands},
          {base + kNumBands + 2 * kNumLowerBands, kNumLowerBands},
          base + kNumBands + 3 * kNumLowerBands};
}
|
||||||
|
|
||||||
|
constexpr float kInitialFeatureVal = -9999.f;
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Checks that an all-zero frame is flagged as silence without touching the
// output and that a non-zero frame is not flagged and overwrites the output.
TEST(RnnVadTest, SpectralFeaturesWithAndWithoutSilence) {
  SpectralFeaturesExtractor sfe;
  std::array<float, kFrameSize20ms24kHz> samples;
  rtc::ArrayView<float, kFrameSize20ms24kHz> samples_view(samples.data(),
                                                          samples.size());
  std::array<float, kTestFeatureVectorSize> feature_vector;
  auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);

  // Sentinel value used to detect whether the output has been written.
  feature_vector.fill(kInitialFeatureVal);
  const auto output_untouched = [&feature_vector]() {
    return std::all_of(feature_vector.begin(), feature_vector.end(),
                       [](float x) { return x == kInitialFeatureVal; });
  };

  // TODO(bugs.webrtc.org/8948): Add when the issue is fixed.
  // FloatingPointExceptionObserver fpe_observer;

  // Silence: expected to be detected, the output must keep the sentinel.
  samples.fill(0.f);
  EXPECT_TRUE(sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
                                              feature_vector_view));
  EXPECT_TRUE(output_untouched());

  // No silence: not expected to be detected, the output must be overwritten.
  WriteTestData(samples);
  EXPECT_FALSE(sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
                                               feature_vector_view));
  EXPECT_FALSE(output_untouched());
}
|
||||||
|
|
||||||
|
// When the input signal does not change, the spectral coefficients average does
|
||||||
|
// not change and the derivatives are zero. Similarly, the spectral variability
|
||||||
|
// score does not change either.
|
||||||
|
TEST(RnnVadTest, SpectralFeaturesConstantAverageZeroDerivative) {
|
||||||
|
// Initialize.
|
||||||
|
SpectralFeaturesExtractor sfe;
|
||||||
|
std::array<float, kFrameSize20ms24kHz> samples;
|
||||||
|
rtc::ArrayView<float, kFrameSize20ms24kHz> samples_view(samples.data(),
|
||||||
|
samples.size());
|
||||||
|
WriteTestData(samples);
|
||||||
|
bool is_silence;
|
||||||
|
|
||||||
|
// Fill the spectral features with test data.
|
||||||
|
std::array<float, kTestFeatureVectorSize> feature_vector;
|
||||||
|
auto feature_vector_view = GetSpectralFeaturesView(&feature_vector);
|
||||||
|
for (size_t i = 0; i < kSpectralCoeffsHistorySize; ++i) {
|
||||||
|
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
|
||||||
|
feature_vector_view);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Feed the test data one last time but using a different output vector.
|
||||||
|
std::array<float, kTestFeatureVectorSize> feature_vector_last;
|
||||||
|
auto feature_vector_last_view = GetSpectralFeaturesView(&feature_vector_last);
|
||||||
|
is_silence = sfe.CheckSilenceComputeFeatures(samples_view, samples_view,
|
||||||
|
feature_vector_last_view);
|
||||||
|
|
||||||
|
// Average is unchanged.
|
||||||
|
ExpectEqualFloatArray({feature_vector.data(), kNumLowerBands},
|
||||||
|
{feature_vector_last.data(), kNumLowerBands});
|
||||||
|
// First and second derivatives are zero.
|
||||||
|
constexpr std::array<float, kNumLowerBands> zeros{};
|
||||||
|
ExpectEqualFloatArray(
|
||||||
|
{feature_vector_last.data() + kNumBands, kNumLowerBands},
|
||||||
|
{zeros.data(), zeros.size()});
|
||||||
|
ExpectEqualFloatArray(
|
||||||
|
{feature_vector_last.data() + kNumBands + kNumLowerBands, kNumLowerBands},
|
||||||
|
{zeros.data(), zeros.size()});
|
||||||
|
// Spectral variability is unchanged.
|
||||||
|
EXPECT_FLOAT_EQ(feature_vector[kNumBands + 3 * kNumLowerBands],
|
||||||
|
feature_vector_last[kNumBands + 3 * kNumLowerBands]);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace test
|
||||||
|
} // namespace rnn_vad
|
||||||
|
} // namespace webrtc
|
Loading…
Reference in a new issue