mirror of
https://github.com/mollyim/webrtc.git
synced 2025-05-13 05:40:42 +01:00
Reland "Remove post-decode VAD"
This is a reland of commit 89cf26f1e0
Original change's description:
> Remove post-decode VAD
>
> Bug: webrtc:15806
> Change-Id: I6acf8734a70703085cfc1ccf82a79ee0931f59a4
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/336460
> Reviewed-by: Sam Zackrisson <saza@webrtc.org>
> Commit-Queue: Tomas Lundqvist <tomasl@google.com>
> Reviewed-by: Jakob Ivarsson <jakobi@webrtc.org>
> Cr-Commit-Position: refs/heads/main@{#41653}
Bug: webrtc:15806
Change-Id: I1c2c0ce568c3c1817ff5c65bee91b9f961d46559
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/337442
Reviewed-by: Jakob Ivarsson <jakobi@webrtc.org>
Commit-Queue: Tomas Lundqvist <tomasl@google.com>
Reviewed-by: Sam Zackrisson <saza@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#41688}
This commit is contained in:
parent
39ac25d6ec
commit
aaa123debb
13 changed files with 34 additions and 370 deletions
|
@ -24,8 +24,7 @@ NetEq::Config& NetEq::Config::operator=(Config&&) = default;
|
||||||
std::string NetEq::Config::ToString() const {
|
std::string NetEq::Config::ToString() const {
|
||||||
char buf[1024];
|
char buf[1024];
|
||||||
rtc::SimpleStringBuilder ss(buf);
|
rtc::SimpleStringBuilder ss(buf);
|
||||||
ss << "sample_rate_hz=" << sample_rate_hz << ", enable_post_decode_vad="
|
ss << "sample_rate_hz=" << sample_rate_hz
|
||||||
<< (enable_post_decode_vad ? "true" : "false")
|
|
||||||
<< ", max_packets_in_buffer=" << max_packets_in_buffer
|
<< ", max_packets_in_buffer=" << max_packets_in_buffer
|
||||||
<< ", min_delay_ms=" << min_delay_ms << ", enable_fast_accelerate="
|
<< ", min_delay_ms=" << min_delay_ms << ", enable_fast_accelerate="
|
||||||
<< (enable_fast_accelerate ? "true" : "false")
|
<< (enable_fast_accelerate ? "true" : "false")
|
||||||
|
|
|
@ -130,7 +130,6 @@ class NetEq {
|
||||||
std::string ToString() const;
|
std::string ToString() const;
|
||||||
|
|
||||||
int sample_rate_hz = 48000; // Initial value. Will change with input data.
|
int sample_rate_hz = 48000; // Initial value. Will change with input data.
|
||||||
bool enable_post_decode_vad = false;
|
|
||||||
size_t max_packets_in_buffer = 200;
|
size_t max_packets_in_buffer = 200;
|
||||||
int max_delay_ms = 0;
|
int max_delay_ms = 0;
|
||||||
int min_delay_ms = 0;
|
int min_delay_ms = 0;
|
||||||
|
@ -197,18 +196,17 @@ class NetEq {
|
||||||
|
|
||||||
// Instructs NetEq to deliver 10 ms of audio data. The data is written to
|
// Instructs NetEq to deliver 10 ms of audio data. The data is written to
|
||||||
// `audio_frame`. All data in `audio_frame` is wiped; `data_`, `speech_type_`,
|
// `audio_frame`. All data in `audio_frame` is wiped; `data_`, `speech_type_`,
|
||||||
// `num_channels_`, `sample_rate_hz_`, `samples_per_channel_`, and
|
// `num_channels_`, `sample_rate_hz_` and `samples_per_channel_` are updated
|
||||||
// `vad_activity_` are updated upon success. If an error is returned, some
|
// upon success. If an error is returned, some fields may not have been
|
||||||
// fields may not have been updated, or may contain inconsistent values.
|
// updated, or may contain inconsistent values. If muted state is enabled
|
||||||
// If muted state is enabled (through Config::enable_muted_state), `muted`
|
// (through Config::enable_muted_state), `muted` may be set to true after a
|
||||||
// may be set to true after a prolonged expand period. When this happens, the
|
// prolonged expand period. When this happens, the `data_` in `audio_frame`
|
||||||
// `data_` in `audio_frame` is not written, but should be interpreted as being
|
// is not written, but should be interpreted as being all zeros. For testing
|
||||||
// all zeros. For testing purposes, an override can be supplied in the
|
// purposes, an override can be supplied in the `action_override` argument,
|
||||||
// `action_override` argument, which will cause NetEq to take this action
|
// which will cause NetEq to take this action next, instead of the action it
|
||||||
// next, instead of the action it would normally choose. An optional output
|
// would normally choose. An optional output argument for fetching the current
|
||||||
// argument for fetching the current sample rate can be provided, which
|
// sample rate can be provided, which will return the same value as
|
||||||
// will return the same value as last_output_sample_rate_hz() but will avoid
|
// last_output_sample_rate_hz() but will avoid additional synchronization.
|
||||||
// additional synchronization.
|
|
||||||
// Returns kOK on success, or kFail in case of an error.
|
// Returns kOK on success, or kFail in case of an error.
|
||||||
virtual int GetAudio(
|
virtual int GetAudio(
|
||||||
AudioFrame* audio_frame,
|
AudioFrame* audio_frame,
|
||||||
|
@ -278,13 +276,6 @@ class NetEq {
|
||||||
// statistics are never reset.
|
// statistics are never reset.
|
||||||
virtual NetEqOperationsAndState GetOperationsAndState() const = 0;
|
virtual NetEqOperationsAndState GetOperationsAndState() const = 0;
|
||||||
|
|
||||||
// Enables post-decode VAD. When enabled, GetAudio() will return
|
|
||||||
// kOutputVADPassive when the signal contains no speech.
|
|
||||||
virtual void EnableVad() {}
|
|
||||||
|
|
||||||
// Disables post-decode VAD.
|
|
||||||
virtual void DisableVad() {}
|
|
||||||
|
|
||||||
// Returns the RTP timestamp for the last sample delivered by GetAudio().
|
// Returns the RTP timestamp for the last sample delivered by GetAudio().
|
||||||
// The return value will be empty if no valid timestamp is available.
|
// The return value will be empty if no valid timestamp is available.
|
||||||
virtual absl::optional<uint32_t> GetPlayoutTimestamp() const = 0;
|
virtual absl::optional<uint32_t> GetPlayoutTimestamp() const = 0;
|
||||||
|
|
|
@ -689,8 +689,6 @@ rtc_library("neteq") {
|
||||||
"neteq/packet_arrival_history.h",
|
"neteq/packet_arrival_history.h",
|
||||||
"neteq/packet_buffer.cc",
|
"neteq/packet_buffer.cc",
|
||||||
"neteq/packet_buffer.h",
|
"neteq/packet_buffer.h",
|
||||||
"neteq/post_decode_vad.cc",
|
|
||||||
"neteq/post_decode_vad.h",
|
|
||||||
"neteq/preemptive_expand.cc",
|
"neteq/preemptive_expand.cc",
|
||||||
"neteq/preemptive_expand.h",
|
"neteq/preemptive_expand.h",
|
||||||
"neteq/random_vector.cc",
|
"neteq/random_vector.cc",
|
||||||
|
@ -1655,7 +1653,6 @@ if (rtc_include_tests) {
|
||||||
"neteq/normal_unittest.cc",
|
"neteq/normal_unittest.cc",
|
||||||
"neteq/packet_arrival_history_unittest.cc",
|
"neteq/packet_arrival_history_unittest.cc",
|
||||||
"neteq/packet_buffer_unittest.cc",
|
"neteq/packet_buffer_unittest.cc",
|
||||||
"neteq/post_decode_vad_unittest.cc",
|
|
||||||
"neteq/random_vector_unittest.cc",
|
"neteq/random_vector_unittest.cc",
|
||||||
"neteq/red_payload_splitter_unittest.cc",
|
"neteq/red_payload_splitter_unittest.cc",
|
||||||
"neteq/reorder_optimizer_unittest.cc",
|
"neteq/reorder_optimizer_unittest.cc",
|
||||||
|
|
|
@ -50,11 +50,7 @@ std::unique_ptr<NetEq> CreateNetEq(
|
||||||
|
|
||||||
AcmReceiver::Config::Config(
|
AcmReceiver::Config::Config(
|
||||||
rtc::scoped_refptr<AudioDecoderFactory> decoder_factory)
|
rtc::scoped_refptr<AudioDecoderFactory> decoder_factory)
|
||||||
: clock(*Clock::GetRealTimeClock()), decoder_factory(decoder_factory) {
|
: clock(*Clock::GetRealTimeClock()), decoder_factory(decoder_factory) {}
|
||||||
// Post-decode VAD is disabled by default in NetEq, however, Audio
|
|
||||||
// Conference Mixer relies on VAD decisions and fails without them.
|
|
||||||
neteq_config.enable_post_decode_vad = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
AcmReceiver::Config::Config(const Config&) = default;
|
AcmReceiver::Config::Config(const Config&) = default;
|
||||||
AcmReceiver::Config::~Config() = default;
|
AcmReceiver::Config::~Config() = default;
|
||||||
|
|
|
@ -190,9 +190,6 @@ class AcmReceiverTestFaxModeOldApi : public AcmReceiverTestOldApi {
|
||||||
const size_t output_channels = info.num_channels;
|
const size_t output_channels = info.num_channels;
|
||||||
const size_t samples_per_ms = rtc::checked_cast<size_t>(
|
const size_t samples_per_ms = rtc::checked_cast<size_t>(
|
||||||
rtc::CheckedDivExact(output_sample_rate_hz, 1000));
|
rtc::CheckedDivExact(output_sample_rate_hz, 1000));
|
||||||
const AudioFrame::VADActivity expected_vad_activity =
|
|
||||||
output_sample_rate_hz > 16000 ? AudioFrame::kVadActive
|
|
||||||
: AudioFrame::kVadPassive;
|
|
||||||
|
|
||||||
// Expect the first output timestamp to be 5*fs/8000 samples before the
|
// Expect the first output timestamp to be 5*fs/8000 samples before the
|
||||||
// first inserted timestamp (because of NetEq's look-ahead). (This value is
|
// first inserted timestamp (because of NetEq's look-ahead). (This value is
|
||||||
|
@ -217,7 +214,6 @@ class AcmReceiverTestFaxModeOldApi : public AcmReceiverTestOldApi {
|
||||||
EXPECT_EQ(output_sample_rate_hz, frame.sample_rate_hz_);
|
EXPECT_EQ(output_sample_rate_hz, frame.sample_rate_hz_);
|
||||||
EXPECT_EQ(output_channels, frame.num_channels_);
|
EXPECT_EQ(output_channels, frame.num_channels_);
|
||||||
EXPECT_EQ(AudioFrame::kNormalSpeech, frame.speech_type_);
|
EXPECT_EQ(AudioFrame::kNormalSpeech, frame.speech_type_);
|
||||||
EXPECT_EQ(expected_vad_activity, frame.vad_activity_);
|
|
||||||
EXPECT_FALSE(muted);
|
EXPECT_FALSE(muted);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -242,61 +238,6 @@ TEST_F(AcmReceiverTestFaxModeOldApi, MAYBE_VerifyAudioFrameOpus) {
|
||||||
RunVerifyAudioFrame({"opus", 48000, 2});
|
RunVerifyAudioFrame({"opus", 48000, 2});
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(WEBRTC_ANDROID)
|
|
||||||
#define MAYBE_PostdecodingVad DISABLED_PostdecodingVad
|
|
||||||
#else
|
|
||||||
#define MAYBE_PostdecodingVad PostdecodingVad
|
|
||||||
#endif
|
|
||||||
TEST_F(AcmReceiverTestOldApi, MAYBE_PostdecodingVad) {
|
|
||||||
EXPECT_TRUE(config_.neteq_config.enable_post_decode_vad);
|
|
||||||
constexpr int payload_type = 34;
|
|
||||||
const SdpAudioFormat codec = {"L16", 16000, 1};
|
|
||||||
const AudioCodecInfo info = SetEncoder(payload_type, codec);
|
|
||||||
receiver_->SetCodecs({{payload_type, codec}});
|
|
||||||
constexpr int kNumPackets = 5;
|
|
||||||
AudioFrame frame;
|
|
||||||
for (int n = 0; n < kNumPackets; ++n) {
|
|
||||||
const int num_10ms_frames = InsertOnePacketOfSilence(info);
|
|
||||||
for (int k = 0; k < num_10ms_frames; ++k) {
|
|
||||||
bool muted;
|
|
||||||
ASSERT_EQ(0, receiver_->GetAudio(info.sample_rate_hz, &frame, &muted));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
EXPECT_EQ(AudioFrame::kVadPassive, frame.vad_activity_);
|
|
||||||
}
|
|
||||||
|
|
||||||
class AcmReceiverTestPostDecodeVadPassiveOldApi : public AcmReceiverTestOldApi {
|
|
||||||
protected:
|
|
||||||
AcmReceiverTestPostDecodeVadPassiveOldApi() {
|
|
||||||
config_.neteq_config.enable_post_decode_vad = false;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
#if defined(WEBRTC_ANDROID)
|
|
||||||
#define MAYBE_PostdecodingVad DISABLED_PostdecodingVad
|
|
||||||
#else
|
|
||||||
#define MAYBE_PostdecodingVad PostdecodingVad
|
|
||||||
#endif
|
|
||||||
TEST_F(AcmReceiverTestPostDecodeVadPassiveOldApi, MAYBE_PostdecodingVad) {
|
|
||||||
EXPECT_FALSE(config_.neteq_config.enable_post_decode_vad);
|
|
||||||
constexpr int payload_type = 34;
|
|
||||||
const SdpAudioFormat codec = {"L16", 16000, 1};
|
|
||||||
const AudioCodecInfo info = SetEncoder(payload_type, codec);
|
|
||||||
auto const value = encoder_factory_->QueryAudioEncoder(codec);
|
|
||||||
ASSERT_TRUE(value.has_value());
|
|
||||||
receiver_->SetCodecs({{payload_type, codec}});
|
|
||||||
const int kNumPackets = 5;
|
|
||||||
AudioFrame frame;
|
|
||||||
for (int n = 0; n < kNumPackets; ++n) {
|
|
||||||
const int num_10ms_frames = InsertOnePacketOfSilence(info);
|
|
||||||
for (int k = 0; k < num_10ms_frames; ++k) {
|
|
||||||
bool muted;
|
|
||||||
ASSERT_EQ(0, receiver_->GetAudio(info.sample_rate_hz, &frame, &muted));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
EXPECT_EQ(AudioFrame::kVadUnknown, frame.vad_activity_);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if defined(WEBRTC_ANDROID)
|
#if defined(WEBRTC_ANDROID)
|
||||||
#define MAYBE_LastAudioCodec DISABLED_LastAudioCodec
|
#define MAYBE_LastAudioCodec DISABLED_LastAudioCodec
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -17,7 +17,6 @@
|
||||||
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
#include "common_audio/signal_processing/include/signal_processing_library.h"
|
||||||
#include "modules/audio_coding/neteq/audio_multi_vector.h"
|
#include "modules/audio_coding/neteq/audio_multi_vector.h"
|
||||||
#include "modules/audio_coding/neteq/cross_correlation.h"
|
#include "modules/audio_coding/neteq/cross_correlation.h"
|
||||||
#include "modules/audio_coding/neteq/post_decode_vad.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
namespace webrtc {
|
||||||
namespace {
|
namespace {
|
||||||
|
@ -44,17 +43,11 @@ void BackgroundNoise::Reset() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool BackgroundNoise::Update(const AudioMultiVector& input,
|
bool BackgroundNoise::Update(const AudioMultiVector& sync_buffer) {
|
||||||
const PostDecodeVad& vad) {
|
|
||||||
bool filter_params_saved = false;
|
bool filter_params_saved = false;
|
||||||
if (vad.running() && vad.active_speech()) {
|
|
||||||
// Do not update the background noise parameters if we know that the signal
|
|
||||||
// is active speech.
|
|
||||||
return filter_params_saved;
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t auto_correlation[kMaxLpcOrder + 1];
|
int32_t auto_correlation[kMaxLpcOrder + 1];
|
||||||
int16_t fiter_output[kMaxLpcOrder + kResidualLength];
|
int16_t filter_output[kMaxLpcOrder + kResidualLength];
|
||||||
int16_t reflection_coefficients[kMaxLpcOrder];
|
int16_t reflection_coefficients[kMaxLpcOrder];
|
||||||
int16_t lpc_coefficients[kMaxLpcOrder + 1];
|
int16_t lpc_coefficients[kMaxLpcOrder + 1];
|
||||||
|
|
||||||
|
@ -62,14 +55,13 @@ bool BackgroundNoise::Update(const AudioMultiVector& input,
|
||||||
ChannelParameters& parameters = channel_parameters_[channel_ix];
|
ChannelParameters& parameters = channel_parameters_[channel_ix];
|
||||||
int16_t temp_signal_array[kVecLen + kMaxLpcOrder] = {0};
|
int16_t temp_signal_array[kVecLen + kMaxLpcOrder] = {0};
|
||||||
int16_t* temp_signal = &temp_signal_array[kMaxLpcOrder];
|
int16_t* temp_signal = &temp_signal_array[kMaxLpcOrder];
|
||||||
RTC_DCHECK_GE(input.Size(), kVecLen);
|
RTC_DCHECK_GE(sync_buffer.Size(), kVecLen);
|
||||||
input[channel_ix].CopyTo(kVecLen, input.Size() - kVecLen, temp_signal);
|
sync_buffer[channel_ix].CopyTo(kVecLen, sync_buffer.Size() - kVecLen,
|
||||||
|
temp_signal);
|
||||||
int32_t sample_energy =
|
int32_t sample_energy =
|
||||||
CalculateAutoCorrelation(temp_signal, kVecLen, auto_correlation);
|
CalculateAutoCorrelation(temp_signal, kVecLen, auto_correlation);
|
||||||
|
|
||||||
if ((!vad.running() &&
|
if (sample_energy < parameters.energy_update_threshold) {
|
||||||
sample_energy < parameters.energy_update_threshold) ||
|
|
||||||
(vad.running() && !vad.active_speech())) {
|
|
||||||
// Generate LPC coefficients.
|
// Generate LPC coefficients.
|
||||||
if (auto_correlation[0] <= 0) {
|
if (auto_correlation[0] <= 0) {
|
||||||
// Center value in auto-correlation is not positive. Do not update.
|
// Center value in auto-correlation is not positive. Do not update.
|
||||||
|
@ -95,10 +87,10 @@ bool BackgroundNoise::Update(const AudioMultiVector& input,
|
||||||
|
|
||||||
// Generate the CNG gain factor by looking at the energy of the residual.
|
// Generate the CNG gain factor by looking at the energy of the residual.
|
||||||
WebRtcSpl_FilterMAFastQ12(temp_signal + kVecLen - kResidualLength,
|
WebRtcSpl_FilterMAFastQ12(temp_signal + kVecLen - kResidualLength,
|
||||||
fiter_output, lpc_coefficients,
|
filter_output, lpc_coefficients,
|
||||||
kMaxLpcOrder + 1, kResidualLength);
|
kMaxLpcOrder + 1, kResidualLength);
|
||||||
int32_t residual_energy = WebRtcSpl_DotProductWithScale(
|
int32_t residual_energy = WebRtcSpl_DotProductWithScale(
|
||||||
fiter_output, fiter_output, kResidualLength, 0);
|
filter_output, filter_output, kResidualLength, 0);
|
||||||
|
|
||||||
// Check spectral flatness.
|
// Check spectral flatness.
|
||||||
// Comparing the residual variance with the input signal variance tells
|
// Comparing the residual variance with the input signal variance tells
|
||||||
|
@ -117,9 +109,8 @@ bool BackgroundNoise::Update(const AudioMultiVector& input,
|
||||||
filter_params_saved = true;
|
filter_params_saved = true;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Will only happen if post-decode VAD is disabled and `sample_energy` is
|
// Will only happen if `sample_energy` is not low enough. Increase the
|
||||||
// not low enough. Increase the threshold for update so that it increases
|
// threshold for update so that it increases by a factor 4 in 4 seconds.
|
||||||
// by a factor 4 in 4 seconds.
|
|
||||||
IncrementEnergyThreshold(channel_ix, sample_energy);
|
IncrementEnergyThreshold(channel_ix, sample_energy);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,9 +39,9 @@ class BackgroundNoise {
|
||||||
void Reset();
|
void Reset();
|
||||||
|
|
||||||
// Updates the parameter estimates based on the signal currently in the
|
// Updates the parameter estimates based on the signal currently in the
|
||||||
// `sync_buffer`, and on the latest decision in `vad` if it is running.
|
// `sync_buffer`.
|
||||||
// Returns true if the filter parameters are updated.
|
// Returns true if the filter parameters are updated.
|
||||||
bool Update(const AudioMultiVector& sync_buffer, const PostDecodeVad& vad);
|
bool Update(const AudioMultiVector& sync_buffer);
|
||||||
|
|
||||||
// Generates background noise given a random vector and writes the output to
|
// Generates background noise given a random vector and writes the output to
|
||||||
// `buffer`.
|
// `buffer`.
|
||||||
|
|
|
@ -37,7 +37,6 @@
|
||||||
#include "modules/audio_coding/neteq/normal.h"
|
#include "modules/audio_coding/neteq/normal.h"
|
||||||
#include "modules/audio_coding/neteq/packet.h"
|
#include "modules/audio_coding/neteq/packet.h"
|
||||||
#include "modules/audio_coding/neteq/packet_buffer.h"
|
#include "modules/audio_coding/neteq/packet_buffer.h"
|
||||||
#include "modules/audio_coding/neteq/post_decode_vad.h"
|
|
||||||
#include "modules/audio_coding/neteq/preemptive_expand.h"
|
#include "modules/audio_coding/neteq/preemptive_expand.h"
|
||||||
#include "modules/audio_coding/neteq/red_payload_splitter.h"
|
#include "modules/audio_coding/neteq/red_payload_splitter.h"
|
||||||
#include "modules/audio_coding/neteq/statistics_calculator.h"
|
#include "modules/audio_coding/neteq/statistics_calculator.h"
|
||||||
|
@ -72,49 +71,26 @@ std::unique_ptr<NetEqController> CreateNetEqController(
|
||||||
return controller_factory.CreateNetEqController(config);
|
return controller_factory.CreateNetEqController(config);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SetAudioFrameActivityAndType(bool vad_enabled,
|
AudioFrame::SpeechType ToSpeechType(NetEqImpl::OutputType type) {
|
||||||
NetEqImpl::OutputType type,
|
|
||||||
AudioFrame::VADActivity last_vad_activity,
|
|
||||||
AudioFrame* audio_frame) {
|
|
||||||
switch (type) {
|
switch (type) {
|
||||||
case NetEqImpl::OutputType::kNormalSpeech: {
|
case NetEqImpl::OutputType::kNormalSpeech: {
|
||||||
audio_frame->speech_type_ = AudioFrame::kNormalSpeech;
|
return AudioFrame::kNormalSpeech;
|
||||||
audio_frame->vad_activity_ = AudioFrame::kVadActive;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case NetEqImpl::OutputType::kVadPassive: {
|
|
||||||
// This should only be reached if the VAD is enabled.
|
|
||||||
RTC_DCHECK(vad_enabled);
|
|
||||||
audio_frame->speech_type_ = AudioFrame::kNormalSpeech;
|
|
||||||
audio_frame->vad_activity_ = AudioFrame::kVadPassive;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
case NetEqImpl::OutputType::kCNG: {
|
case NetEqImpl::OutputType::kCNG: {
|
||||||
audio_frame->speech_type_ = AudioFrame::kCNG;
|
return AudioFrame::kCNG;
|
||||||
audio_frame->vad_activity_ = AudioFrame::kVadPassive;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
case NetEqImpl::OutputType::kPLC: {
|
case NetEqImpl::OutputType::kPLC: {
|
||||||
audio_frame->speech_type_ = AudioFrame::kPLC;
|
return AudioFrame::kPLC;
|
||||||
audio_frame->vad_activity_ = last_vad_activity;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
case NetEqImpl::OutputType::kPLCCNG: {
|
case NetEqImpl::OutputType::kPLCCNG: {
|
||||||
audio_frame->speech_type_ = AudioFrame::kPLCCNG;
|
return AudioFrame::kPLCCNG;
|
||||||
audio_frame->vad_activity_ = AudioFrame::kVadPassive;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
case NetEqImpl::OutputType::kCodecPLC: {
|
case NetEqImpl::OutputType::kCodecPLC: {
|
||||||
audio_frame->speech_type_ = AudioFrame::kCodecPLC;
|
return AudioFrame::kCodecPLC;
|
||||||
audio_frame->vad_activity_ = last_vad_activity;
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
RTC_DCHECK_NOTREACHED();
|
RTC_DCHECK_NOTREACHED();
|
||||||
}
|
return AudioFrame::kUndefined;
|
||||||
if (!vad_enabled) {
|
|
||||||
// Always set kVadUnknown when receive VAD is inactive.
|
|
||||||
audio_frame->vad_activity_ = AudioFrame::kVadUnknown;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -171,7 +147,6 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config,
|
||||||
packet_buffer_(std::move(deps.packet_buffer)),
|
packet_buffer_(std::move(deps.packet_buffer)),
|
||||||
red_payload_splitter_(std::move(deps.red_payload_splitter)),
|
red_payload_splitter_(std::move(deps.red_payload_splitter)),
|
||||||
timestamp_scaler_(std::move(deps.timestamp_scaler)),
|
timestamp_scaler_(std::move(deps.timestamp_scaler)),
|
||||||
vad_(new PostDecodeVad()),
|
|
||||||
expand_factory_(std::move(deps.expand_factory)),
|
expand_factory_(std::move(deps.expand_factory)),
|
||||||
accelerate_factory_(std::move(deps.accelerate_factory)),
|
accelerate_factory_(std::move(deps.accelerate_factory)),
|
||||||
preemptive_expand_factory_(std::move(deps.preemptive_expand_factory)),
|
preemptive_expand_factory_(std::move(deps.preemptive_expand_factory)),
|
||||||
|
@ -215,10 +190,6 @@ NetEqImpl::NetEqImpl(const NetEq::Config& config,
|
||||||
if (create_components) {
|
if (create_components) {
|
||||||
SetSampleRateAndChannels(fs, 1); // Default is 1 channel.
|
SetSampleRateAndChannels(fs, 1); // Default is 1 channel.
|
||||||
}
|
}
|
||||||
RTC_DCHECK(!vad_->enabled());
|
|
||||||
if (config.enable_post_decode_vad) {
|
|
||||||
vad_->Enable();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
NetEqImpl::~NetEqImpl() = default;
|
NetEqImpl::~NetEqImpl() = default;
|
||||||
|
@ -256,9 +227,7 @@ int NetEqImpl::GetAudio(AudioFrame* audio_frame,
|
||||||
audio_frame->sample_rate_hz_,
|
audio_frame->sample_rate_hz_,
|
||||||
rtc::dchecked_cast<int>(audio_frame->samples_per_channel_ * 100));
|
rtc::dchecked_cast<int>(audio_frame->samples_per_channel_ * 100));
|
||||||
RTC_DCHECK_EQ(*muted, audio_frame->muted());
|
RTC_DCHECK_EQ(*muted, audio_frame->muted());
|
||||||
SetAudioFrameActivityAndType(vad_->enabled(), LastOutputType(),
|
audio_frame->speech_type_ = ToSpeechType(LastOutputType());
|
||||||
last_vad_activity_, audio_frame);
|
|
||||||
last_vad_activity_ = audio_frame->vad_activity_;
|
|
||||||
last_output_sample_rate_hz_ = audio_frame->sample_rate_hz_;
|
last_output_sample_rate_hz_ = audio_frame->sample_rate_hz_;
|
||||||
RTC_DCHECK(last_output_sample_rate_hz_ == 8000 ||
|
RTC_DCHECK(last_output_sample_rate_hz_ == 8000 ||
|
||||||
last_output_sample_rate_hz_ == 16000 ||
|
last_output_sample_rate_hz_ == 16000 ||
|
||||||
|
@ -402,18 +371,6 @@ NetEqOperationsAndState NetEqImpl::GetOperationsAndState() const {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void NetEqImpl::EnableVad() {
|
|
||||||
MutexLock lock(&mutex_);
|
|
||||||
RTC_DCHECK(vad_.get());
|
|
||||||
vad_->Enable();
|
|
||||||
}
|
|
||||||
|
|
||||||
void NetEqImpl::DisableVad() {
|
|
||||||
MutexLock lock(&mutex_);
|
|
||||||
RTC_DCHECK(vad_.get());
|
|
||||||
vad_->Disable();
|
|
||||||
}
|
|
||||||
|
|
||||||
absl::optional<uint32_t> NetEqImpl::GetPlayoutTimestamp() const {
|
absl::optional<uint32_t> NetEqImpl::GetPlayoutTimestamp() const {
|
||||||
MutexLock lock(&mutex_);
|
MutexLock lock(&mutex_);
|
||||||
if (first_packet_ || last_mode_ == Mode::kRfc3389Cng ||
|
if (first_packet_ || last_mode_ == Mode::kRfc3389Cng ||
|
||||||
|
@ -874,11 +831,8 @@ int NetEqImpl::GetAudioInternal(AudioFrame* audio_frame,
|
||||||
last_decoded_type_ = speech_type;
|
last_decoded_type_ = speech_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
RTC_DCHECK(vad_.get());
|
|
||||||
bool sid_frame_available =
|
bool sid_frame_available =
|
||||||
(operation == Operation::kRfc3389Cng && !packet_list.empty());
|
(operation == Operation::kRfc3389Cng && !packet_list.empty());
|
||||||
vad_->Update(decoded_buffer_.get(), static_cast<size_t>(length), speech_type,
|
|
||||||
sid_frame_available, fs_hz_);
|
|
||||||
|
|
||||||
// This is the criterion that we did decode some data through the speech
|
// This is the criterion that we did decode some data through the speech
|
||||||
// decoder, and the operation resulted in comfort noise.
|
// decoder, and the operation resulted in comfort noise.
|
||||||
|
@ -1028,7 +982,7 @@ int NetEqImpl::GetAudioInternal(AudioFrame* audio_frame,
|
||||||
(last_mode_ == Mode::kPreemptiveExpandFail) ||
|
(last_mode_ == Mode::kPreemptiveExpandFail) ||
|
||||||
(last_mode_ == Mode::kRfc3389Cng) ||
|
(last_mode_ == Mode::kRfc3389Cng) ||
|
||||||
(last_mode_ == Mode::kCodecInternalCng)) {
|
(last_mode_ == Mode::kCodecInternalCng)) {
|
||||||
background_noise_->Update(*sync_buffer_, *vad_.get());
|
background_noise_->Update(*sync_buffer_);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (operation == Operation::kDtmf) {
|
if (operation == Operation::kDtmf) {
|
||||||
|
@ -2104,10 +2058,6 @@ void NetEqImpl::SetSampleRateAndChannels(int fs_hz, size_t channels) {
|
||||||
if (cng_decoder)
|
if (cng_decoder)
|
||||||
cng_decoder->Reset();
|
cng_decoder->Reset();
|
||||||
|
|
||||||
// Reinit post-decode VAD with new sample rate.
|
|
||||||
RTC_DCHECK(vad_.get()); // Cannot be NULL here.
|
|
||||||
vad_->Init();
|
|
||||||
|
|
||||||
// Delete algorithm buffer and create a new one.
|
// Delete algorithm buffer and create a new one.
|
||||||
algorithm_buffer_.reset(new AudioMultiVector(channels));
|
algorithm_buffer_.reset(new AudioMultiVector(channels));
|
||||||
|
|
||||||
|
@ -2148,7 +2098,6 @@ void NetEqImpl::SetSampleRateAndChannels(int fs_hz, size_t channels) {
|
||||||
}
|
}
|
||||||
|
|
||||||
NetEqImpl::OutputType NetEqImpl::LastOutputType() {
|
NetEqImpl::OutputType NetEqImpl::LastOutputType() {
|
||||||
RTC_DCHECK(vad_.get());
|
|
||||||
RTC_DCHECK(expand_.get());
|
RTC_DCHECK(expand_.get());
|
||||||
if (last_mode_ == Mode::kCodecInternalCng ||
|
if (last_mode_ == Mode::kCodecInternalCng ||
|
||||||
last_mode_ == Mode::kRfc3389Cng) {
|
last_mode_ == Mode::kRfc3389Cng) {
|
||||||
|
@ -2158,8 +2107,6 @@ NetEqImpl::OutputType NetEqImpl::LastOutputType() {
|
||||||
return OutputType::kPLCCNG;
|
return OutputType::kPLCCNG;
|
||||||
} else if (last_mode_ == Mode::kExpand) {
|
} else if (last_mode_ == Mode::kExpand) {
|
||||||
return OutputType::kPLC;
|
return OutputType::kPLC;
|
||||||
} else if (vad_->running() && !vad_->active_speech()) {
|
|
||||||
return OutputType::kVadPassive;
|
|
||||||
} else if (last_mode_ == Mode::kCodecPlc) {
|
} else if (last_mode_ == Mode::kCodecPlc) {
|
||||||
return OutputType::kCodecPLC;
|
return OutputType::kCodecPLC;
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -48,7 +48,6 @@ class Merge;
|
||||||
class NackTracker;
|
class NackTracker;
|
||||||
class Normal;
|
class Normal;
|
||||||
class RedPayloadSplitter;
|
class RedPayloadSplitter;
|
||||||
class PostDecodeVad;
|
|
||||||
class PreemptiveExpand;
|
class PreemptiveExpand;
|
||||||
class RandomVector;
|
class RandomVector;
|
||||||
class SyncBuffer;
|
class SyncBuffer;
|
||||||
|
@ -171,13 +170,6 @@ class NetEqImpl : public webrtc::NetEq {
|
||||||
|
|
||||||
NetEqOperationsAndState GetOperationsAndState() const override;
|
NetEqOperationsAndState GetOperationsAndState() const override;
|
||||||
|
|
||||||
// Enables post-decode VAD. When enabled, GetAudio() will return
|
|
||||||
// kOutputVADPassive when the signal contains no speech.
|
|
||||||
void EnableVad() override;
|
|
||||||
|
|
||||||
// Disables post-decode VAD.
|
|
||||||
void DisableVad() override;
|
|
||||||
|
|
||||||
absl::optional<uint32_t> GetPlayoutTimestamp() const override;
|
absl::optional<uint32_t> GetPlayoutTimestamp() const override;
|
||||||
|
|
||||||
int last_output_sample_rate_hz() const override;
|
int last_output_sample_rate_hz() const override;
|
||||||
|
@ -359,7 +351,6 @@ class NetEqImpl : public webrtc::NetEq {
|
||||||
RTC_GUARDED_BY(mutex_);
|
RTC_GUARDED_BY(mutex_);
|
||||||
const std::unique_ptr<TimestampScaler> timestamp_scaler_
|
const std::unique_ptr<TimestampScaler> timestamp_scaler_
|
||||||
RTC_GUARDED_BY(mutex_);
|
RTC_GUARDED_BY(mutex_);
|
||||||
const std::unique_ptr<PostDecodeVad> vad_ RTC_GUARDED_BY(mutex_);
|
|
||||||
const std::unique_ptr<ExpandFactory> expand_factory_ RTC_GUARDED_BY(mutex_);
|
const std::unique_ptr<ExpandFactory> expand_factory_ RTC_GUARDED_BY(mutex_);
|
||||||
const std::unique_ptr<AccelerateFactory> accelerate_factory_
|
const std::unique_ptr<AccelerateFactory> accelerate_factory_
|
||||||
RTC_GUARDED_BY(mutex_);
|
RTC_GUARDED_BY(mutex_);
|
||||||
|
@ -401,8 +392,6 @@ class NetEqImpl : public webrtc::NetEq {
|
||||||
std::unique_ptr<NackTracker> nack_ RTC_GUARDED_BY(mutex_);
|
std::unique_ptr<NackTracker> nack_ RTC_GUARDED_BY(mutex_);
|
||||||
bool nack_enabled_ RTC_GUARDED_BY(mutex_);
|
bool nack_enabled_ RTC_GUARDED_BY(mutex_);
|
||||||
const bool enable_muted_state_ RTC_GUARDED_BY(mutex_);
|
const bool enable_muted_state_ RTC_GUARDED_BY(mutex_);
|
||||||
AudioFrame::VADActivity last_vad_activity_ RTC_GUARDED_BY(mutex_) =
|
|
||||||
AudioFrame::kVadPassive;
|
|
||||||
std::unique_ptr<TickTimer::Stopwatch> generated_noise_stopwatch_
|
std::unique_ptr<TickTimer::Stopwatch> generated_noise_stopwatch_
|
||||||
RTC_GUARDED_BY(mutex_);
|
RTC_GUARDED_BY(mutex_);
|
||||||
std::vector<RtpPacketInfo> last_decoded_packet_infos_ RTC_GUARDED_BY(mutex_);
|
std::vector<RtpPacketInfo> last_decoded_packet_infos_ RTC_GUARDED_BY(mutex_);
|
||||||
|
|
|
@ -1,90 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "modules/audio_coding/neteq/post_decode_vad.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
|
||||||
|
|
||||||
PostDecodeVad::~PostDecodeVad() {
|
|
||||||
if (vad_instance_)
|
|
||||||
WebRtcVad_Free(vad_instance_);
|
|
||||||
}
|
|
||||||
|
|
||||||
void PostDecodeVad::Enable() {
|
|
||||||
if (!vad_instance_) {
|
|
||||||
// Create the instance.
|
|
||||||
vad_instance_ = WebRtcVad_Create();
|
|
||||||
if (vad_instance_ == nullptr) {
|
|
||||||
// Failed to create instance.
|
|
||||||
Disable();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Init();
|
|
||||||
enabled_ = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
void PostDecodeVad::Disable() {
|
|
||||||
enabled_ = false;
|
|
||||||
running_ = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
void PostDecodeVad::Init() {
|
|
||||||
running_ = false;
|
|
||||||
if (vad_instance_) {
|
|
||||||
WebRtcVad_Init(vad_instance_);
|
|
||||||
WebRtcVad_set_mode(vad_instance_, kVadMode);
|
|
||||||
running_ = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void PostDecodeVad::Update(int16_t* signal,
|
|
||||||
size_t length,
|
|
||||||
AudioDecoder::SpeechType speech_type,
|
|
||||||
bool sid_frame,
|
|
||||||
int fs_hz) {
|
|
||||||
if (!vad_instance_ || !enabled_) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (speech_type == AudioDecoder::kComfortNoise || sid_frame ||
|
|
||||||
fs_hz > 16000) {
|
|
||||||
// TODO(hlundin): Remove restriction on fs_hz.
|
|
||||||
running_ = false;
|
|
||||||
active_speech_ = true;
|
|
||||||
sid_interval_counter_ = 0;
|
|
||||||
} else if (!running_) {
|
|
||||||
++sid_interval_counter_;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sid_interval_counter_ >= kVadAutoEnable) {
|
|
||||||
Init();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (length > 0 && running_) {
|
|
||||||
size_t vad_sample_index = 0;
|
|
||||||
active_speech_ = false;
|
|
||||||
// Loop through frame sizes 30, 20, and 10 ms.
|
|
||||||
for (int vad_frame_size_ms = 30; vad_frame_size_ms >= 10;
|
|
||||||
vad_frame_size_ms -= 10) {
|
|
||||||
size_t vad_frame_size_samples =
|
|
||||||
static_cast<size_t>(vad_frame_size_ms * fs_hz / 1000);
|
|
||||||
while (length - vad_sample_index >= vad_frame_size_samples) {
|
|
||||||
int vad_return =
|
|
||||||
WebRtcVad_Process(vad_instance_, fs_hz, &signal[vad_sample_index],
|
|
||||||
vad_frame_size_samples);
|
|
||||||
active_speech_ |= (vad_return == 1);
|
|
||||||
vad_sample_index += vad_frame_size_samples;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace webrtc
|
|
|
@ -1,71 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef MODULES_AUDIO_CODING_NETEQ_POST_DECODE_VAD_H_
|
|
||||||
#define MODULES_AUDIO_CODING_NETEQ_POST_DECODE_VAD_H_
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
|
|
||||||
#include "api/audio_codecs/audio_decoder.h"
|
|
||||||
#include "common_audio/vad/include/webrtc_vad.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
|
||||||
|
|
||||||
class PostDecodeVad {
|
|
||||||
public:
|
|
||||||
PostDecodeVad()
|
|
||||||
: enabled_(false),
|
|
||||||
running_(false),
|
|
||||||
active_speech_(true),
|
|
||||||
sid_interval_counter_(0),
|
|
||||||
vad_instance_(NULL) {}
|
|
||||||
|
|
||||||
virtual ~PostDecodeVad();
|
|
||||||
|
|
||||||
PostDecodeVad(const PostDecodeVad&) = delete;
|
|
||||||
PostDecodeVad& operator=(const PostDecodeVad&) = delete;
|
|
||||||
|
|
||||||
// Enables post-decode VAD.
|
|
||||||
void Enable();
|
|
||||||
|
|
||||||
// Disables post-decode VAD.
|
|
||||||
void Disable();
|
|
||||||
|
|
||||||
// Initializes post-decode VAD.
|
|
||||||
void Init();
|
|
||||||
|
|
||||||
// Updates post-decode VAD with the audio data in `signal` having `length`
|
|
||||||
// samples. The data is of type `speech_type`, at the sample rate `fs_hz`.
|
|
||||||
void Update(int16_t* signal,
|
|
||||||
size_t length,
|
|
||||||
AudioDecoder::SpeechType speech_type,
|
|
||||||
bool sid_frame,
|
|
||||||
int fs_hz);
|
|
||||||
|
|
||||||
// Accessors.
|
|
||||||
bool enabled() const { return enabled_; }
|
|
||||||
bool running() const { return running_; }
|
|
||||||
bool active_speech() const { return active_speech_; }
|
|
||||||
|
|
||||||
private:
|
|
||||||
static const int kVadMode = 0; // Sets aggressiveness to "Normal".
|
|
||||||
// Number of Update() calls without CNG/SID before re-enabling VAD.
|
|
||||||
static const int kVadAutoEnable = 3000;
|
|
||||||
|
|
||||||
bool enabled_;
|
|
||||||
bool running_;
|
|
||||||
bool active_speech_;
|
|
||||||
int sid_interval_counter_;
|
|
||||||
::VadInst* vad_instance_;
|
|
||||||
};
|
|
||||||
|
|
||||||
} // namespace webrtc
|
|
||||||
#endif // MODULES_AUDIO_CODING_NETEQ_POST_DECODE_VAD_H_
|
|
|
@ -1,25 +0,0 @@
|
||||||
/*
|
|
||||||
* Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
|
|
||||||
*
|
|
||||||
* Use of this source code is governed by a BSD-style license
|
|
||||||
* that can be found in the LICENSE file in the root of the source
|
|
||||||
* tree. An additional intellectual property rights grant can be found
|
|
||||||
* in the file PATENTS. All contributing project authors may
|
|
||||||
* be found in the AUTHORS file in the root of the source tree.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Unit tests for PostDecodeVad class.
|
|
||||||
|
|
||||||
#include "modules/audio_coding/neteq/post_decode_vad.h"
|
|
||||||
|
|
||||||
#include "test/gtest.h"
|
|
||||||
|
|
||||||
namespace webrtc {
|
|
||||||
|
|
||||||
TEST(PostDecodeVad, CreateAndDestroy) {
|
|
||||||
PostDecodeVad vad;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(hlundin): Write more tests.
|
|
||||||
|
|
||||||
} // namespace webrtc
|
|
|
@ -179,7 +179,6 @@ void FuzzOneInputTest(const uint8_t* data, size_t size) {
|
||||||
// Configure NetEq and the NetEqTest object.
|
// Configure NetEq and the NetEqTest object.
|
||||||
NetEqTest::Callbacks callbacks;
|
NetEqTest::Callbacks callbacks;
|
||||||
NetEq::Config config;
|
NetEq::Config config;
|
||||||
config.enable_post_decode_vad = true;
|
|
||||||
config.enable_fast_accelerate = true;
|
config.enable_fast_accelerate = true;
|
||||||
auto codecs = NetEqTest::StandardDecoderMap();
|
auto codecs = NetEqTest::StandardDecoderMap();
|
||||||
// rate_types contains the payload types that will be used for encoding.
|
// rate_types contains the payload types that will be used for encoding.
|
||||||
|
|
Loading…
Reference in a new issue