mirror of
https://github.com/mollyim/webrtc.git
synced 2025-05-15 23:01:21 +01:00

During pitch search in the RNN VAD, we calculate the auto-correlation. Before this CL, we computed kNumInvertedLags12kHz=147 dot products of vectors with kBufSize12kHz-kMaxPitch12kHz=240 elements. This was the most time-consuming step of the new VAD. This CL makes the computation happen in the frequency domain. Profiling shows a 3x speed increase. In the future, we can try using a more efficient FFT and reducing the FFT length to, e.g., 400, 405, or 432. # For minimal Clang plugin check change. TBR: kwiberg@webrtc.org Bug: webrtc:9076 Change-Id: I688251a415869d53175a37f390f441d4e035d954 Reviewed-on: https://webrtc-review.googlesource.com/73366 Reviewed-by: Karl Wiberg <kwiberg@webrtc.org> Reviewed-by: Alessio Bazzica <alessiob@webrtc.org> Commit-Queue: Alex Loiko <aleloi@webrtc.org> Cr-Commit-Position: refs/heads/master@{#23171}
107 lines
4.6 KiB
C++
107 lines
4.6 KiB
C++
/*
|
|
* Copyright (c) 2018 The WebRTC project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
#ifndef MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
|
|
#define MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
|
|
|
|
#include <array>
|
|
|
|
#include "api/array_view.h"
|
|
#include "common_audio/real_fourier.h"
|
|
#include "modules/audio_processing/agc2/rnn_vad/common.h"
|
|
#include "modules/audio_processing/agc2/rnn_vad/pitch_info.h"
|
|
|
|
namespace webrtc {
|
|
namespace rnn_vad {
|
|
|
|
// The inverted lags for the pitch interval [|kInitialMinPitch12kHz|,
|
|
// |kMaxPitch12kHz|] are in the range [0, |kNumInvertedLags|].
|
|
static_assert(kMaxPitch12kHz > kInitialMinPitch12kHz, "");
|
|
static_assert(kMaxPitch24kHz > kInitialMinPitch24kHz, "");
|
|
constexpr size_t kNumInvertedLags12kHz = kMaxPitch12kHz - kInitialMinPitch12kHz;
|
|
constexpr size_t kNumInvertedLags24kHz = kMaxPitch24kHz - kInitialMinPitch24kHz;
|
|
constexpr int kAutoCorrelationFftOrder = 9; // Length-512 FFT.
|
|
|
|
static_assert(1 << kAutoCorrelationFftOrder >
|
|
kNumInvertedLags12kHz + kBufSize12kHz - kMaxPitch12kHz,
|
|
"");
|
|
|
|
// Performs 2x decimation without any anti-aliasing filter.
|
|
void Decimate2x(rtc::ArrayView<const float, kBufSize24kHz> src,
|
|
rtc::ArrayView<float, kBufSize12kHz> dst);
|
|
|
|
// Computes a gain threshold for a candidate pitch period. The threshold is
// derived from:
// - the initial pitch period and gain estimates,
// - the previous pitch period and gain estimates,
// - the pitch period ratio used to derive the candidate pitch period from the
//   initial period.
//
// NOTE(review): |prev_pitch_gain| is declared as size_t whereas
// |initial_pitch_gain| is a float, hence a fractional gain passed here is
// truncated to an integer. This looks unintentional; confirm against the
// definition and the callers, and if so change the type in both the
// declaration and the definition at the same time.
float ComputePitchGainThreshold(size_t candidate_pitch_period,
                                size_t pitch_period_ratio,
                                size_t initial_pitch_period,
                                float initial_pitch_gain,
                                size_t prev_pitch_period,
                                size_t prev_pitch_gain);
|
|
|
|
// Computes the sum of squared samples for every sliding frame in the pitch
|
|
// buffer. |yy_values| indexes are lags.
|
|
//
|
|
// The pitch buffer is structured as depicted below:
|
|
// |.........|...........|
|
|
// a b
|
|
// The part on the left, named "a" contains the oldest samples, whereas "b" the
|
|
// most recent ones. The size of "a" corresponds to the maximum pitch period,
|
|
// that of "b" to the frame size (e.g., 16 ms and 20 ms respectively).
|
|
void ComputeSlidingFrameSquareEnergies(
|
|
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
|
|
rtc::ArrayView<float, kMaxPitch24kHz + 1> yy_values);
|
|
|
|
// Computes the auto-correlation coefficients for a given pitch interval.
|
|
// |auto_corr| indexes are inverted lags.
|
|
//
|
|
// The auto-correlations coefficients are computed as follows:
|
|
// |.........|...........| <- pitch buffer
|
|
// [ x (fixed) ]
|
|
// [ y_0 ]
|
|
// [ y_{m-1} ]
|
|
// x and y are sub-array of equal length; x is never moved, whereas y slides.
|
|
// The cross-correlation between y_0 and x corresponds to the auto-correlation
|
|
// for the maximum pitch period. Hence, the first value in |auto_corr| has an
|
|
// inverted lag equal to 0 that corresponds to a lag equal to the maximum pitch
|
|
// period.
|
|
void ComputePitchAutoCorrelation(
|
|
rtc::ArrayView<const float, kBufSize12kHz> pitch_buf,
|
|
size_t max_pitch_period,
|
|
rtc::ArrayView<float, kNumInvertedLags12kHz> auto_corr,
|
|
webrtc::RealFourier* fft);
|
|
|
|
// Given the auto-correlation coefficients stored according to
|
|
// ComputePitchAutoCorrelation() (i.e., using inverted lags), returns the best
|
|
// and the second best pitch periods.
|
|
std::array<size_t, 2> FindBestPitchPeriods(
|
|
rtc::ArrayView<const float> auto_corr,
|
|
rtc::ArrayView<const float> pitch_buf,
|
|
size_t max_pitch_period);
|
|
|
|
// Refines the pitch period estimation given the pitch buffer |pitch_buf| and
|
|
// the initial pitch period estimation |inv_lags|. Returns an inverted lag at
|
|
// 48 kHz.
|
|
size_t RefinePitchPeriod48kHz(
|
|
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
|
|
rtc::ArrayView<const size_t, 2> inv_lags);
|
|
|
|
// Refines the pitch period estimation and compute the pitch gain. Returns the
|
|
// refined pitch estimation data at 48 kHz.
|
|
PitchInfo CheckLowerPitchPeriodsAndComputePitchGain(
|
|
rtc::ArrayView<const float, kBufSize24kHz> pitch_buf,
|
|
size_t initial_pitch_period_48kHz,
|
|
PitchInfo prev_pitch_48kHz);
|
|
|
|
} // namespace rnn_vad
|
|
} // namespace webrtc
|
|
|
|
#endif // MODULES_AUDIO_PROCESSING_AGC2_RNN_VAD_PITCH_SEARCH_INTERNAL_H_
|