mirror of
https://github.com/mollyim/webrtc.git
synced 2025-05-15 14:50:39 +01:00

Bug: webrtc:13579 Change-Id: Idb05a64cfd16aed68d40cd427a6b516caa5e2077 Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/269387 Reviewed-by: Ivo Creusen <ivoc@webrtc.org> Commit-Queue: Ali Tofigh <alito@webrtc.org> Cr-Commit-Position: refs/heads/main@{#37800}
675 lines
24 KiB
C++
675 lines
24 KiB
C++
/*
|
|
* Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
|
|
*
|
|
* Use of this source code is governed by a BSD-style license
|
|
* that can be found in the LICENSE file in the root of the source
|
|
* tree. An additional intellectual property rights grant can be found
|
|
* in the file PATENTS. All contributing project authors may
|
|
* be found in the AUTHORS file in the root of the source tree.
|
|
*/
|
|
|
|
// This file consists of unit tests for webrtc::test::conversational_speech
|
|
// members. Part of them focus on accepting or rejecting different
|
|
// conversational speech setups. A setup is defined by a set of audio tracks and
|
|
// timing information.
|
|
// The docstring at the beginning of each TEST(ConversationalSpeechTest,
|
|
// MultiEndCallSetup*) function looks like the drawing below and indicates which
|
|
// setup is tested.
|
|
//
|
|
// Accept:
|
|
// A 0****.....
|
|
// B .....1****
|
|
//
|
|
// The drawing indicates the following:
|
|
// - the illustrated setup should be accepted,
|
|
// - there are two speakers (namely, A and B),
|
|
// - A speaks first, B speaks second,
|
|
// - each character after the speaker's letter indicates a time unit (e.g., 100
|
|
// ms),
|
|
// - "*" indicates speaking, "." listening,
|
|
// - numbers indicate the turn index in std::vector<Turn>.
|
|
//
|
|
// Note that the same speaker can appear in multiple lines in order to depict
|
|
// cases in which there are wrong offsets leading to self cross-talk (which is
|
|
// rejected).
|
|
|
|
// MSVC++ requires this to be set before any other includes to get M_PI.
|
|
#define _USE_MATH_DEFINES
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <cmath>
|
|
#include <map>
|
|
#include <memory>
|
|
#include <vector>
|
|
|
|
#include "absl/strings/string_view.h"
|
|
#include "absl/types/optional.h"
|
|
#include "common_audio/wav_file.h"
|
|
#include "modules/audio_processing/test/conversational_speech/config.h"
|
|
#include "modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h"
|
|
#include "modules/audio_processing/test/conversational_speech/multiend_call.h"
|
|
#include "modules/audio_processing/test/conversational_speech/simulator.h"
|
|
#include "modules/audio_processing/test/conversational_speech/timing.h"
|
|
#include "modules/audio_processing/test/conversational_speech/wavreader_factory.h"
|
|
#include "rtc_base/logging.h"
|
|
#include "test/gmock.h"
|
|
#include "test/gtest.h"
|
|
#include "test/testsupport/file_utils.h"
|
|
|
|
namespace webrtc {
|
|
namespace test {
|
|
namespace {
|
|
|
|
using conversational_speech::LoadTiming;
|
|
using conversational_speech::MockWavReaderFactory;
|
|
using conversational_speech::MultiEndCall;
|
|
using conversational_speech::SaveTiming;
|
|
using conversational_speech::Turn;
|
|
using conversational_speech::WavReaderFactory;
|
|
|
|
// Placeholder paths. They are handed to conversational_speech::Config in the
// getter test, and `audiotracks_path` is also passed to MultiEndCall together
// with a mock wav reader factory.
constexpr const char* audiotracks_path = "/path/to/audiotracks";
constexpr const char* timing_filepath = "/path/to/timing_file.txt";
constexpr const char* output_path = "/path/to/output_dir";
|
|
|
|
const std::vector<Turn> expected_timing = {
|
|
{"A", "a1", 0, 0}, {"B", "b1", 0, 0}, {"A", "a2", 100, 0},
|
|
{"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0},
|
|
};
|
|
const std::size_t kNumberOfTurns = expected_timing.size();
|
|
|
|
// Default arguments for MockWavReaderFactory ctor.
|
|
// Fake audio track parameters.
|
|
constexpr int kDefaultSampleRate = 48000;
|
|
const std::map<std::string, const MockWavReaderFactory::Params>
|
|
kDefaultMockWavReaderFactoryParamsMap = {
|
|
{"t300", {kDefaultSampleRate, 1u, 14400u}}, // Mono, 0.3 seconds.
|
|
{"t500", {kDefaultSampleRate, 1u, 24000u}}, // Mono, 0.5 seconds.
|
|
{"t1000", {kDefaultSampleRate, 1u, 48000u}}, // Mono, 1.0 seconds.
|
|
{"sr8000", {8000, 1u, 8000u}}, // 8kHz sample rate, mono, 1 second.
|
|
{"sr16000", {16000, 1u, 16000u}}, // 16kHz sample rate, mono, 1 second.
|
|
{"sr16000_stereo", {16000, 2u, 16000u}}, // Like sr16000, but stereo.
|
|
};
|
|
const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams =
|
|
kDefaultMockWavReaderFactoryParamsMap.at("t500");
|
|
|
|
std::unique_ptr<MockWavReaderFactory> CreateMockWavReaderFactory() {
|
|
return std::unique_ptr<MockWavReaderFactory>(
|
|
new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams,
|
|
kDefaultMockWavReaderFactoryParamsMap));
|
|
}
|
|
|
|
// Writes a sine tone to the wav file at `filepath`.
// `params` supplies the sample rate, the number of channels and the number of
// samples to generate; `frequency` is the tone frequency (same unit as the
// reciprocal of the sample rate).
void CreateSineWavFile(absl::string_view filepath,
                       const MockWavReaderFactory::Params& params,
                       float frequency = 440.0f) {
  constexpr double kTwoPi = 2.0 * M_PI;

  // Synthesize the tone, scaled to the positive 16 bit full scale.
  std::vector<int16_t> tone(params.num_samples);
  std::size_t n = 0;
  for (int16_t& sample : tone) {
    // TODO(alessiob): the produced tone is not pure, improve.
    sample = std::lround(
        32767.0f * std::sin(kTwoPi * n * frequency / params.sample_rate));
    ++n;
  }

  // Dump the tone through a WavWriter opened with the requested format.
  WavWriter writer(filepath, params.sample_rate, params.num_channels);
  writer.WriteSamples(tone.data(), params.num_samples);
}
|
|
|
|
// Parameters to generate audio tracks with CreateSineWavFile.
|
|
struct SineAudioTrackParams {
|
|
MockWavReaderFactory::Params params;
|
|
float frequency;
|
|
};
|
|
|
|
// Creates a temporary directory in which sine audio tracks are written.
|
|
std::string CreateTemporarySineAudioTracks(
|
|
const std::map<std::string, SineAudioTrackParams>& sine_tracks_params) {
|
|
// Create temporary directory.
|
|
std::string temp_directory =
|
|
OutputPath() + "TempConversationalSpeechAudioTracks";
|
|
CreateDir(temp_directory);
|
|
|
|
// Create sine tracks.
|
|
for (const auto& it : sine_tracks_params) {
|
|
const std::string temp_filepath = JoinFilename(temp_directory, it.first);
|
|
CreateSineWavFile(temp_filepath, it.second.params, it.second.frequency);
|
|
}
|
|
|
|
return temp_directory;
|
|
}
|
|
|
|
void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory,
|
|
absl::string_view filepath,
|
|
const MockWavReaderFactory::Params& expeted_params) {
|
|
auto wav_reader = wav_reader_factory.Create(filepath);
|
|
EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate());
|
|
EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels());
|
|
EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples());
|
|
}
|
|
|
|
// Recursively removes `dir` together with the files and directories it
// contains. Does nothing if `dir` does not exist.
// Raises a fatal test failure if the directory cannot be listed or if an
// entry is neither a directory nor a file; callers can detect that with
// EXPECT_NO_FATAL_FAILURE().
void DeleteFolderAndContents(absl::string_view dir) {
  if (!DirExists(dir)) {
    return;
  }
  absl::optional<std::vector<std::string>> dir_content = ReadDirectory(dir);
  // This must be fatal: the loop below dereferences `dir_content`, which is
  // undefined behavior when the optional is empty.
  ASSERT_TRUE(dir_content.has_value());
  for (const auto& path : *dir_content) {
    if (DirExists(path)) {
      DeleteFolderAndContents(path);
    } else if (FileExists(path)) {
      // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
      RemoveFile(path);
    } else {
      FAIL();
    }
  }
  // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed.
  RemoveDir(dir);
}
|
|
|
|
} // namespace
|
|
|
|
using ::testing::_;
|
|
|
|
// Checks that conversational_speech::Config returns, through its getters, the
// paths it was constructed with.
TEST(ConversationalSpeechTest, Settings) {
  const conversational_speech::Config config(audiotracks_path, timing_filepath,
                                             output_path);

  // Each getter must hand back the corresponding ctor argument.
  EXPECT_EQ(audiotracks_path, config.audiotracks_path());
  EXPECT_EQ(timing_filepath, config.timing_filepath());
  EXPECT_EQ(output_path, config.output_path());
}
|
|
|
|
// Saves `expected_timing` to a temporary file, loads it back and checks that
// the loaded turns equal the saved ones.
TEST(ConversationalSpeechTest, TimingSaveLoad) {
  // Save test timing.
  const std::string temporary_filepath =
      TempFilename(OutputPath(), "TempTimingTestFile");
  SaveTiming(temporary_filepath, expected_timing);

  // Create a std::vector<Turn> instance by loading from file, then remove the
  // temporary file since it is no longer needed.
  std::vector<Turn> actual_timing = LoadTiming(temporary_filepath);
  RemoveFile(temporary_filepath);

  // Check size. This must be fatal: the loop below indexes `actual_timing`
  // with indices valid for `expected_timing`, which would read out of bounds
  // if fewer turns were loaded.
  ASSERT_EQ(expected_timing.size(), actual_timing.size());

  // Check Turn instances.
  for (size_t index = 0; index < expected_timing.size(); ++index) {
    EXPECT_EQ(expected_timing[index], actual_timing[index])
        << "turn #" << index << " not matching";
  }
}
|
|
|
|
// Builds a MultiEndCall from the reference timing using a mock wav reader
// factory and checks the number of speakers, track readers and turns.
TEST(ConversationalSpeechTest, MultiEndCallCreate) {
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();

  // `expected_timing` references 5 unique audio tracks, each read once.
  EXPECT_CALL(*factory, Create(_)).Times(5);

  // The call takes ownership of the mock factory.
  MultiEndCall call(expected_timing, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Two speakers, five distinct tracks, six turns.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(5u, call.audiotrack_readers().size());
  EXPECT_EQ(6u, call.speaking_turns().size());
}
|
|
|
|
// A setup whose tracks have different sample rates must be rejected.
TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) {
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000", 0, 0},
  };

  // Two unique audio tracks, each read once.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
// A setup that uses a stereo track must be rejected.
TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) {
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "sr16000_stereo", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };

  // Both turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
// A setup mixing an 8 kHz mono track with a 16 kHz stereo track must be
// rejected.
TEST(ConversationalSpeechTest,
     MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) {
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "sr8000", 0, 0},
      {"B", "sr16000_stereo", 0, 0},
  };

  // Two unique audio tracks, each read once.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
// A setup whose first turn has a negative offset must be rejected.
TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) {
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", -100, 0},
      {"B", "t500", 0, 0},
  };

  // Both turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) {
  // Accept:
  // A 0****.....
  // B .....1****
  // Two back-to-back 0.5 s turns: one second in total.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 0, 0},
  };

  // Both turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call extracted from the timing.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupPause) {
  // Accept:
  // A 0****.......
  // B .......1****
  // Two 0.5 s turns separated by a 0.2 s pause.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", 200, 0},
  };

  // Both turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call extracted from the timing.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) {
  // Accept:
  // A 0****....
  // B ....1****
  // Two 0.5 s turns overlapping by 0.1 s.
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 0.9;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -100, 0},
  };

  // Both turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call extracted from the timing.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) {
  // Reject:
  // A ..0****
  // B .1****.  The n-th turn cannot start before the (n-1)-th one.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", 200, 0},
      {"B", "t500", -600, 0},
  };

  // Both turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) {
  // Accept:
  // A 0****2****...
  // B ...1*********
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.3;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t1000", -200, 0},
      {"A", "t500", -800, 0},
  };

  // Two unique audio tracks ("t500" and "t1000"), each read once.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call extracted from the timing.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) {
  // Reject:
  // A 0****......
  // A ...1****...
  // B ......2****
  //      ^  Turn #1 overlaps with #0 which is from the same speaker.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"A", "t500", -200, 0},
      {"B", "t500", -200, 0},
  };

  // All turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) {
  // Reject:
  // A 0*********
  // B 1**.......
  // C ...2**....
  // A ......3**.
  //         ^  Turn #3 overlaps with #0 which is from the same speaker.
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t300", -1000, 0},
      {"C", "t300", 0, 0},
      {"A", "t300", 0, 0},
  };

  // Two unique audio tracks ("t1000" and "t300"), each read once.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) {
  // Accept:
  // A 0*********..
  // B ..1****.....
  // C .......2****
  constexpr std::size_t kExpectedDurationSamples = kDefaultSampleRate * 1.2;
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", 0, 0},
  };

  // Two unique audio tracks ("t1000" and "t500"), each read once.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call extracted from the timing.
  EXPECT_EQ(3u, call.speaker_names().size());
  EXPECT_EQ(2u, call.audiotrack_readers().size());
  EXPECT_EQ(3u, call.speaking_turns().size());
  EXPECT_EQ(kExpectedDurationSamples, call.total_duration_samples());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) {
  // Reject:
  // A 0*********
  // B ..1****...
  // C ....2****.
  //       ^  Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers
  //          not permitted).
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -800, 0},
      {"C", "t500", -300, 0},
  };

  // Two unique audio tracks ("t1000" and "t500"), each read once.
  EXPECT_CALL(*factory, Create(_)).Times(2);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_FALSE(call.valid());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) {
  // Accept:
  // A 0*********..
  // B .1****......
  // C .......2****
  // (Turn numbers are indices into `timing`: B is turn #1, C is turn #2.)
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2;
  const std::vector<Turn> timing = {
      {"A", "t1000", 0, 0},
      {"B", "t500", -900, 0},
      {"C", "t500", 100, 0},
  };
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  // Inject the mock wav reader factory.
  conversational_speech::MultiEndCall multiend_call(
      timing, audiotracks_path, std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(3u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) {
  // Accept:
  // A 0****
  // B 1****
  std::unique_ptr<MockWavReaderFactory> factory = CreateMockWavReaderFactory();
  const std::vector<Turn> turns = {
      {"A", "t500", 0, 0},
      {"B", "t500", -500, 0},
  };

  // Both turns share the same track, so it is read only once.
  EXPECT_CALL(*factory, Create(_)).Times(1);

  MultiEndCall call(turns, audiotracks_path, std::move(factory));
  EXPECT_TRUE(call.valid());

  // Check what the call extracted from the timing.
  EXPECT_EQ(2u, call.speaker_names().size());
  EXPECT_EQ(1u, call.audiotrack_readers().size());
  EXPECT_EQ(2u, call.speaking_turns().size());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) {
  // Accept:
  // A 0****....3****.5**.
  // B .....1****...4**...
  // C ......2**.......6**
  constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9;
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0}, {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -100, 0},
      {"C", "t300", -200, 0},
  };
  // Use the shared helper, like the other mock-based tests in this file.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  // Inject the mock wav reader factory.
  conversational_speech::MultiEndCall multiend_call(
      timing, audiotracks_path, std::move(mock_wavreader_factory));
  EXPECT_TRUE(multiend_call.valid());

  // Test.
  EXPECT_EQ(3u, multiend_call.speaker_names().size());
  EXPECT_EQ(2u, multiend_call.audiotrack_readers().size());
  EXPECT_EQ(7u, multiend_call.speaking_turns().size());
  EXPECT_EQ(expected_duration, multiend_call.total_duration_samples());
}
|
|
|
|
TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) {
  // Reject:
  // A 0****....3****5**.
  // B .....1****...4**..
  // C ......2**......6**
  //                  ^  Turns #4, #5 and #6 overlapping (cross-talk with 3+
  //                     speakers not permitted).
  const std::vector<Turn> timing = {
      {"A", "t500", 0, 0}, {"B", "t500", 0, 0},    {"C", "t300", -400, 0},
      {"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -200, 0},
      {"C", "t300", -200, 0},
  };
  // Use the shared helper, like the other mock-based tests in this file.
  auto mock_wavreader_factory = CreateMockWavReaderFactory();

  // There are two unique audio tracks to read.
  EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2);

  // Inject the mock wav reader factory.
  conversational_speech::MultiEndCall multiend_call(
      timing, audiotracks_path, std::move(mock_wavreader_factory));
  EXPECT_FALSE(multiend_call.valid());
}
|
|
|
|
// For several sample rates, writes a 5 second mono sine wav file and checks
// that a reader created by WavReaderFactory reports the same parameters.
TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) {
  constexpr int kDurationSeconds = 5;
  const int kSampleRates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000};

  for (int rate : kSampleRates) {
    const std::string wav_filepath =
        OutputPath() + "TempSineWavFile_" + std::to_string(rate) + ".wav";

    // The same parameters are used to write the file and to verify it.
    const std::size_t num_samples = kDurationSeconds * rate;
    const MockWavReaderFactory::Params params = {rate, 1u, num_samples};
    CreateSineWavFile(wav_filepath, params);

    // Read the file back and compare the reported parameters.
    WavReaderFactory wav_reader_factory;
    CheckAudioTrackParams(wav_reader_factory, wav_filepath, params);

    // Remove the temporary wav file.
    RemoveFile(wav_filepath);
  }
}
|
|
|
|
// Simulates a two-speaker call from temporary sine tracks and checks that the
// near-end and far-end files generated for each speaker have the expected
// sample rate, number of channels and length.
TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) {
  // Simulated call (one character corresponding to 500 ms):
  // A 0*********...........2*********.....
  // B ...........1*********.....3*********
  // Locals are named so that they do not shadow the file-level
  // `expected_timing`, `audiotracks_path` and `output_path`.
  const std::vector<Turn> simulated_timing = {
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", 500, 0},
      {"A", "t5000_440.wav", 0, 0},
      {"B", "t5000_880.wav", -2500, 0},
  };
  const std::size_t expected_duration_seconds = 18;

  // Create temporary audio track files.
  const int sample_rate = 16000;
  const std::map<std::string, SineAudioTrackParams> sine_tracks_params = {
      {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}},
      {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}},
  };
  const std::string temp_audiotracks_path =
      CreateTemporarySineAudioTracks(sine_tracks_params);

  // Set up the multi-end call.
  auto wavreader_factory =
      std::unique_ptr<WavReaderFactory>(new WavReaderFactory());
  MultiEndCall multiend_call(simulated_timing, temp_audiotracks_path,
                             std::move(wavreader_factory));

  // Simulate the call.
  std::string simulator_output_path =
      JoinFilename(temp_audiotracks_path, "output");
  CreateDir(simulator_output_path);
  RTC_LOG(LS_VERBOSE) << "simulator output path: " << simulator_output_path;
  auto generated_audiotrack_pairs =
      conversational_speech::Simulate(multiend_call, simulator_output_path);

  // The result is pointer-like; guard against an empty result instead of
  // dereferencing it unconditionally. A non-fatal failure is used so that the
  // temporary files are still removed below.
  if (!generated_audiotrack_pairs) {
    ADD_FAILURE() << "Simulate() did not return any generated audio tracks.";
  } else {
    EXPECT_EQ(2u, generated_audiotrack_pairs->size());

    // Check the output.
    WavReaderFactory wav_reader_factory;
    const MockWavReaderFactory::Params expected_params = {
        sample_rate, 1u, sample_rate * expected_duration_seconds};
    for (const auto& it : *generated_audiotrack_pairs) {
      RTC_LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">";
      CheckAudioTrackParams(wav_reader_factory, it.second.near_end,
                            expected_params);
      CheckAudioTrackParams(wav_reader_factory, it.second.far_end,
                            expected_params);
    }
  }

  // Clean.
  EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(temp_audiotracks_path));
}
|
|
|
|
} // namespace test
|
|
} // namespace webrtc
|