/*
 *  Copyright (c) 2017 The WebRTC project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

// This file consists of unit tests for webrtc::test::conversational_speech
// members. Part of them focus on accepting or rejecting different
// conversational speech setups. A setup is defined by a set of audio tracks and
// timing information.
// The docstring at the beginning of each TEST(ConversationalSpeechTest,
// MultiEndCallSetup*) function looks like the drawing below and indicates which
// setup is tested.
//
//    Accept:
//    A 0****.....
//    B .....1****
//
// The drawing indicates the following:
// - the illustrated setup should be accepted,
// - there are two speakers (namely, A and B),
// - A is the first to speak, B is the second one,
// - each character after the speaker's letter indicates a time unit (e.g., 100
//   ms),
// - "*" indicates speaking, "." listening,
// - numbers indicate the turn index in std::vector<Turn>.
//
// Note that the same speaker can appear in multiple lines in order to depict
// cases in which there are wrong offsets leading to self cross-talk (which is
// rejected).

// MSVC++ requires this to be set before any other includes to get M_PI.
#define _USE_MATH_DEFINES #include #include #include #include #include #include "absl/types/optional.h" #include "common_audio/wav_file.h" #include "modules/audio_processing/test/conversational_speech/config.h" #include "modules/audio_processing/test/conversational_speech/mock_wavreader_factory.h" #include "modules/audio_processing/test/conversational_speech/multiend_call.h" #include "modules/audio_processing/test/conversational_speech/simulator.h" #include "modules/audio_processing/test/conversational_speech/timing.h" #include "modules/audio_processing/test/conversational_speech/wavreader_factory.h" #include "rtc_base/logging.h" #include "test/gmock.h" #include "test/gtest.h" #include "test/testsupport/file_utils.h" namespace webrtc { namespace test { namespace { using conversational_speech::LoadTiming; using conversational_speech::MockWavReaderFactory; using conversational_speech::MultiEndCall; using conversational_speech::SaveTiming; using conversational_speech::Turn; using conversational_speech::WavReaderFactory; const char* const audiotracks_path = "/path/to/audiotracks"; const char* const timing_filepath = "/path/to/timing_file.txt"; const char* const output_path = "/path/to/output_dir"; const std::vector expected_timing = { {"A", "a1", 0, 0}, {"B", "b1", 0, 0}, {"A", "a2", 100, 0}, {"B", "b2", -200, 0}, {"A", "a3", 0, 0}, {"A", "a3", 0, 0}, }; const std::size_t kNumberOfTurns = expected_timing.size(); // Default arguments for MockWavReaderFactory ctor. // Fake audio track parameters. constexpr int kDefaultSampleRate = 48000; const std::map kDefaultMockWavReaderFactoryParamsMap = { {"t300", {kDefaultSampleRate, 1u, 14400u}}, // Mono, 0.3 seconds. {"t500", {kDefaultSampleRate, 1u, 24000u}}, // Mono, 0.5 seconds. {"t1000", {kDefaultSampleRate, 1u, 48000u}}, // Mono, 1.0 seconds. {"sr8000", {8000, 1u, 8000u}}, // 8kHz sample rate, mono, 1 second. {"sr16000", {16000, 1u, 16000u}}, // 16kHz sample rate, mono, 1 second. 
{"sr16000_stereo", {16000, 2u, 16000u}}, // Like sr16000, but stereo. }; const MockWavReaderFactory::Params& kDefaultMockWavReaderFactoryParams = kDefaultMockWavReaderFactoryParamsMap.at("t500"); std::unique_ptr CreateMockWavReaderFactory() { return std::unique_ptr( new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams, kDefaultMockWavReaderFactoryParamsMap)); } void CreateSineWavFile(const std::string& filepath, const MockWavReaderFactory::Params& params, float frequency = 440.0f) { // Create samples. constexpr double two_pi = 2.0 * M_PI; std::vector samples(params.num_samples); for (std::size_t i = 0; i < params.num_samples; ++i) { // TODO(alessiob): the produced tone is not pure, improve. samples[i] = std::lround( 32767.0f * std::sin(two_pi * i * frequency / params.sample_rate)); } // Write samples. WavWriter wav_writer(filepath, params.sample_rate, params.num_channels); wav_writer.WriteSamples(samples.data(), params.num_samples); } // Parameters to generate audio tracks with CreateSineWavFile. struct SineAudioTrackParams { MockWavReaderFactory::Params params; float frequency; }; // Creates a temporary directory in which sine audio tracks are written. std::string CreateTemporarySineAudioTracks( const std::map& sine_tracks_params) { // Create temporary directory. std::string temp_directory = OutputPath() + "TempConversationalSpeechAudioTracks"; CreateDir(temp_directory); // Create sine tracks. 
for (const auto& it : sine_tracks_params) { const std::string temp_filepath = JoinFilename(temp_directory, it.first); CreateSineWavFile(temp_filepath, it.second.params, it.second.frequency); } return temp_directory; } void CheckAudioTrackParams(const WavReaderFactory& wav_reader_factory, const std::string& filepath, const MockWavReaderFactory::Params& expeted_params) { auto wav_reader = wav_reader_factory.Create(filepath); EXPECT_EQ(expeted_params.sample_rate, wav_reader->SampleRate()); EXPECT_EQ(expeted_params.num_channels, wav_reader->NumChannels()); EXPECT_EQ(expeted_params.num_samples, wav_reader->NumSamples()); } void DeleteFolderAndContents(const std::string& dir) { if (!DirExists(dir)) { return; } absl::optional> dir_content = ReadDirectory(dir); EXPECT_TRUE(dir_content); for (const auto& path : *dir_content) { if (DirExists(path)) { DeleteFolderAndContents(path); } else if (FileExists(path)) { // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed. RemoveFile(path); } else { FAIL(); } } // TODO(alessiob): Wrap with EXPECT_TRUE() once webrtc:7769 bug fixed. RemoveDir(dir); } } // namespace using ::testing::_; TEST(ConversationalSpeechTest, Settings) { const conversational_speech::Config config(audiotracks_path, timing_filepath, output_path); // Test getters. EXPECT_EQ(audiotracks_path, config.audiotracks_path()); EXPECT_EQ(timing_filepath, config.timing_filepath()); EXPECT_EQ(output_path, config.output_path()); } TEST(ConversationalSpeechTest, TimingSaveLoad) { // Save test timing. const std::string temporary_filepath = TempFilename(OutputPath(), "TempTimingTestFile"); SaveTiming(temporary_filepath, expected_timing); // Create a std::vector instance by loading from file. std::vector actual_timing = LoadTiming(temporary_filepath); RemoveFile(temporary_filepath); // Check size. EXPECT_EQ(expected_timing.size(), actual_timing.size()); // Check Turn instances. 
for (size_t index = 0; index < expected_timing.size(); ++index) { EXPECT_EQ(expected_timing[index], actual_timing[index]) << "turn #" << index << " not matching"; } } TEST(ConversationalSpeechTest, MultiEndCallCreate) { auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are 5 unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(5); // Inject the mock wav reader factory. conversational_speech::MultiEndCall multiend_call( expected_timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(2u, multiend_call.speaker_names().size()); EXPECT_EQ(5u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(6u, multiend_call.speaking_turns().size()); } TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRates) { const std::vector timing = { {"A", "sr8000", 0, 0}, {"B", "sr16000", 0, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are two unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(::testing::_)).Times(2); MultiEndCall multiend_call(timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupMultipleChannels) { const std::vector timing = { {"A", "sr16000_stereo", 0, 0}, {"B", "sr16000_stereo", 0, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. EXPECT_CALL(*mock_wavreader_factory, Create(::testing::_)).Times(1); MultiEndCall multiend_call(timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupDifferentSampleRatesAndMultipleNumChannels) { const std::vector timing = { {"A", "sr8000", 0, 0}, {"B", "sr16000_stereo", 0, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are two unique audio tracks to read. 
EXPECT_CALL(*mock_wavreader_factory, Create(::testing::_)).Times(2); MultiEndCall multiend_call(timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupFirstOffsetNegative) { const std::vector timing = { {"A", "t500", -100, 0}, {"B", "t500", 0, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupSimple) { // Accept: // A 0****..... // B .....1**** constexpr std::size_t expected_duration = kDefaultSampleRate; const std::vector timing = { {"A", "t500", 0, 0}, {"B", "t500", 0, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(2u, multiend_call.speaker_names().size()); EXPECT_EQ(1u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(2u, multiend_call.speaking_turns().size()); EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } TEST(ConversationalSpeechTest, MultiEndCallSetupPause) { // Accept: // A 0****....... // B .......1**** constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2; const std::vector timing = { {"A", "t500", 0, 0}, {"B", "t500", 200, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. 
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(2u, multiend_call.speaker_names().size()); EXPECT_EQ(1u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(2u, multiend_call.speaking_turns().size()); EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalk) { // Accept: // A 0****.... // B ....1**** constexpr std::size_t expected_duration = kDefaultSampleRate * 0.9; const std::vector timing = { {"A", "t500", 0, 0}, {"B", "t500", -100, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(2u, multiend_call.speaker_names().size()); EXPECT_EQ(1u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(2u, multiend_call.speaking_turns().size()); EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } TEST(ConversationalSpeechTest, MultiEndCallSetupInvalidOrder) { // Reject: // A ..0**** // B .1****. The n-th turn cannot start before the (n-1)-th one. const std::vector timing = { {"A", "t500", 200, 0}, {"B", "t500", -600, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkThree) { // Accept: // A 0****2****... 
// B ...1********* constexpr std::size_t expected_duration = kDefaultSampleRate * 1.3; const std::vector timing = { {"A", "t500", 0, 0}, {"B", "t1000", -200, 0}, {"A", "t500", -800, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are two unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(2u, multiend_call.speaker_names().size()); EXPECT_EQ(2u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(3u, multiend_call.speaking_turns().size()); EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkNearInvalid) { // Reject: // A 0****...... // A ...1****... // B ......2**** // ^ Turn #1 overlaps with #0 which is from the same speaker. const std::vector timing = { {"A", "t500", 0, 0}, {"A", "t500", -200, 0}, {"B", "t500", -200, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupSelfCrossTalkFarInvalid) { // Reject: // A 0********* // B 1**....... // C ...2**.... // A ......3**. // ^ Turn #3 overlaps with #0 which is from the same speaker. const std::vector timing = { {"A", "t1000", 0, 0}, {"B", "t300", -1000, 0}, {"C", "t300", 0, 0}, {"A", "t300", 0, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are two unique audio tracks to read. 
EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleValid) { // Accept: // A 0*********.. // B ..1****..... // C .......2**** constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2; const std::vector timing = { {"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", 0, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are two unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(3u, multiend_call.speaker_names().size()); EXPECT_EQ(2u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(3u, multiend_call.speaking_turns().size()); EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleInvalid) { // Reject: // A 0********* // B ..1****... // C ....2****. // ^ Turn #2 overlaps both with #0 and #1 (cross-talk with 3+ speakers // not permitted). const std::vector timing = { {"A", "t1000", 0, 0}, {"B", "t500", -800, 0}, {"C", "t500", -300, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are two unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkMiddleAndPause) { // Accept: // A 0*********.. // B .2****...... 
// C .......3**** constexpr std::size_t expected_duration = kDefaultSampleRate * 1.2; const std::vector timing = { {"A", "t1000", 0, 0}, {"B", "t500", -900, 0}, {"C", "t500", 100, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There are two unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(3u, multiend_call.speaker_names().size()); EXPECT_EQ(2u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(3u, multiend_call.speaking_turns().size()); EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } TEST(ConversationalSpeechTest, MultiEndCallSetupCrossTalkFullOverlapValid) { // Accept: // A 0**** // B 1**** const std::vector timing = { {"A", "t500", 0, 0}, {"B", "t500", -500, 0}, }; auto mock_wavreader_factory = CreateMockWavReaderFactory(); // There is one unique audio track to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(1); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(2u, multiend_call.speaker_names().size()); EXPECT_EQ(1u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(2u, multiend_call.speaking_turns().size()); } TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequence) { // Accept: // A 0****....3****.5**. // B .....1****...4**... // C ......2**.......6**.. 
constexpr std::size_t expected_duration = kDefaultSampleRate * 1.9; const std::vector timing = { {"A", "t500", 0, 0}, {"B", "t500", 0, 0}, {"C", "t300", -400, 0}, {"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -100, 0}, {"C", "t300", -200, 0}, }; auto mock_wavreader_factory = std::unique_ptr( new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams, kDefaultMockWavReaderFactoryParamsMap)); // There are two unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_TRUE(multiend_call.valid()); // Test. EXPECT_EQ(3u, multiend_call.speaker_names().size()); EXPECT_EQ(2u, multiend_call.audiotrack_readers().size()); EXPECT_EQ(7u, multiend_call.speaking_turns().size()); EXPECT_EQ(expected_duration, multiend_call.total_duration_samples()); } TEST(ConversationalSpeechTest, MultiEndCallSetupLongSequenceInvalid) { // Reject: // A 0****....3****.6** // B .....1****...4**.. // C ......2**.....5**.. // ^ Turns #4, #5 and #6 overlapping (cross-talk with 3+ // speakers not permitted). const std::vector timing = { {"A", "t500", 0, 0}, {"B", "t500", 0, 0}, {"C", "t300", -400, 0}, {"A", "t500", 0, 0}, {"B", "t300", -100, 0}, {"A", "t300", -200, 0}, {"C", "t300", -200, 0}, }; auto mock_wavreader_factory = std::unique_ptr( new MockWavReaderFactory(kDefaultMockWavReaderFactoryParams, kDefaultMockWavReaderFactoryParamsMap)); // There are two unique audio tracks to read. EXPECT_CALL(*mock_wavreader_factory, Create(_)).Times(2); conversational_speech::MultiEndCall multiend_call( timing, audiotracks_path, std::move(mock_wavreader_factory)); EXPECT_FALSE(multiend_call.valid()); } TEST(ConversationalSpeechTest, MultiEndCallWavReaderAdaptorSine) { // Parameters with which wav files are created. 
constexpr int duration_seconds = 5; const int sample_rates[] = {8000, 11025, 16000, 22050, 32000, 44100, 48000}; for (int sample_rate : sample_rates) { const std::string temp_filename = OutputPath() + "TempSineWavFile_" + std::to_string(sample_rate) + ".wav"; // Write wav file. const std::size_t num_samples = duration_seconds * sample_rate; MockWavReaderFactory::Params params = {sample_rate, 1u, num_samples}; CreateSineWavFile(temp_filename, params); // Load wav file and check if params match. WavReaderFactory wav_reader_factory; MockWavReaderFactory::Params expeted_params = {sample_rate, 1u, num_samples}; CheckAudioTrackParams(wav_reader_factory, temp_filename, expeted_params); // Clean up. RemoveFile(temp_filename); } } TEST(ConversationalSpeechTest, DISABLED_MultiEndCallSimulator) { // Simulated call (one character corresponding to 500 ms): // A 0*********...........2*********..... // B ...........1*********.....3********* const std::vector expected_timing = { {"A", "t5000_440.wav", 0, 0}, {"B", "t5000_880.wav", 500, 0}, {"A", "t5000_440.wav", 0, 0}, {"B", "t5000_880.wav", -2500, 0}, }; const std::size_t expected_duration_seconds = 18; // Create temporary audio track files. const int sample_rate = 16000; const std::map sine_tracks_params = { {"t5000_440.wav", {{sample_rate, 1u, sample_rate * 5}, 440.0}}, {"t5000_880.wav", {{sample_rate, 1u, sample_rate * 5}, 880.0}}, }; const std::string audiotracks_path = CreateTemporarySineAudioTracks(sine_tracks_params); // Set up the multi-end call. auto wavreader_factory = std::unique_ptr(new WavReaderFactory()); MultiEndCall multiend_call(expected_timing, audiotracks_path, std::move(wavreader_factory)); // Simulate the call. 
std::string output_path = JoinFilename(audiotracks_path, "output"); CreateDir(output_path); RTC_LOG(LS_VERBOSE) << "simulator output path: " << output_path; auto generated_audiotrak_pairs = conversational_speech::Simulate(multiend_call, output_path); EXPECT_EQ(2u, generated_audiotrak_pairs->size()); // Check the output. WavReaderFactory wav_reader_factory; const MockWavReaderFactory::Params expeted_params = { sample_rate, 1u, sample_rate * expected_duration_seconds}; for (const auto& it : *generated_audiotrak_pairs) { RTC_LOG(LS_VERBOSE) << "checking far/near-end for <" << it.first << ">"; CheckAudioTrackParams(wav_reader_factory, it.second.near_end, expeted_params); CheckAudioTrackParams(wav_reader_factory, it.second.far_end, expeted_params); } // Clean. EXPECT_NO_FATAL_FAILURE(DeleteFolderAndContents(audiotracks_path)); } } // namespace test } // namespace webrtc