diff --git a/BUILD.gn b/BUILD.gn index 5828a81e00..eb9ad83547 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -619,6 +619,16 @@ if (use_libfuzzer || use_afl) { } if (rtc_include_tests && !build_with_chromium) { + rtc_unittests_resources = [ "resources/reference_video_640x360_30fps.y4m" ] + + if (is_ios) { + bundle_data("rtc_unittests_bundle_data") { + testonly = true + sources = rtc_unittests_resources + outputs = [ "{{bundle_resources_dir}}/{{source_file_part}}" ] + } + } + rtc_test("rtc_unittests") { testonly = true @@ -632,6 +642,8 @@ if (rtc_include_tests && !build_with_chromium) { "api/test/metrics:metrics_unittests", "api/transport:stun_unittest", "api/video/test:rtc_api_video_unittests", + "api/video_codecs:libaom_av1_encoder_factory_test", + "api/video_codecs:simple_encoder_wrapper_unittests", "api/video_codecs/test:video_codecs_api_unittests", "api/voip:compile_all_headers", "call:fake_network_pipe_unittests", @@ -660,10 +672,16 @@ if (rtc_include_tests && !build_with_chromium) { "test/network:network_emulation_unittests", ] + data = rtc_unittests_resources + if (rtc_enable_protobuf) { deps += [ "logging:rtc_event_log_tests" ] } + if (is_ios) { + deps += [ ":rtc_unittests_bundle_data" ] + } + if (is_android) { # Do not use Chromium's launcher. native_unittests defines its own JNI_OnLoad. 
use_default_launcher = false diff --git a/api/DEPS b/api/DEPS index b34925e454..eecde25718 100644 --- a/api/DEPS +++ b/api/DEPS @@ -205,6 +205,19 @@ specific_include_rules = { "+modules/video_coding", ], + "video_encoder_factory_interface\.h": [ + "+rtc_base/numerics", + ], + + "video_encoder_interface\.h": [ + "+rtc_base/numerics", + ], + + "simple_encoder_wrapper\.h": [ + "+common_video", + "+modules", + ], + "video_decoder_factory_template.*\.h": [ "+modules/video_coding", ], diff --git a/api/video_codecs/BUILD.gn b/api/video_codecs/BUILD.gn index 05de6a3662..f994d5abea 100644 --- a/api/video_codecs/BUILD.gn +++ b/api/video_codecs/BUILD.gn @@ -280,6 +280,125 @@ rtc_source_set("video_decoder_factory_template_dav1d_adapter") { ] } +rtc_source_set("video_encoding_general") { + public = [ "video_encoding_general.h" ] +} + +rtc_source_set("video_encoder_interface") { + public = [ "video_encoder_interface.h" ] + + deps = [ + ":video_encoding_general", + "../../api/units:data_rate", + "../../api/units:time_delta", + "../../api/units:timestamp", + "../../api/video:encoded_image", + "../../api/video:resolution", + "../../api/video:video_frame", + "../../api/video_codecs:video_codecs_api", + "../../rtc_base:rtc_numerics", + ] + + absl_deps = [ + "//third_party/abseil-cpp/absl/functional:any_invocable", + "//third_party/abseil-cpp/absl/types:optional", + "//third_party/abseil-cpp/absl/types:variant", + ] +} + +rtc_source_set("video_encoder_factory_interface") { + public = [ "video_encoder_factory_interface.h" ] + + deps = [ + ":video_encoder_interface", + ":video_encoding_general", + "../../api/units:time_delta", + "../../api/video:resolution", + "../../rtc_base:rtc_numerics", + ] + + absl_deps = [ + "//third_party/abseil-cpp/absl/types:optional", + "//third_party/abseil-cpp/absl/types:variant", + ] +} + +rtc_library("simple_encoder_wrapper") { + sources = [ + "simple_encoder_wrapper.cc", + "simple_encoder_wrapper.h", + ] + + deps = [ + ":video_encoder_factory_interface", 
+ ":video_encoder_interface", + "../../api/units:data_rate", + "../../api/video_codecs:scalability_mode", + "../../api/video_codecs:scalability_mode_helper", + "../../common_video/generic_frame_descriptor:generic_frame_descriptor", + "../../modules/video_coding/svc:scalability_structures", + "../../rtc_base:logging", + ] + + absl_deps = [ + "//third_party/abseil-cpp/absl/algorithm:container", + "//third_party/abseil-cpp/absl/functional:any_invocable", + ] +} + +rtc_library("simple_encoder_wrapper_unittests") { + testonly = true + + sources = [ "simple_encoder_wrapper_unittests.cc" ] + + deps = [ + ":simple_encoder_wrapper", + ":video_encoder_factory_interface", + ":video_encoder_interface", + "../../api/video:video_frame", + "../../api/video_codecs:libaom_av1_encoder_factory", + "../../test:fileutils", + "../../test:test_support", + "../../test:video_test_support", + ] +} + +rtc_library("libaom_av1_encoder_factory") { + sources = [ + "libaom_av1_encoder_factory.cc", + "libaom_av1_encoder_factory.h", + ] + + deps = [ + ":video_encoder_factory_interface", + ":video_encoder_interface", + "../../api/units:time_delta", + "../../rtc_base:logging", + "//third_party/libaom", + ] + + absl_deps = [ "//third_party/abseil-cpp/absl/algorithm:container" ] +} + +rtc_library("libaom_av1_encoder_factory_test") { + testonly = true + sources = [ "libaom_av1_encoder_factory_test.cc" ] + data = [ "../../resources/reference_video_640x360_30fps.y4m" ] + + deps = [ + ":libaom_av1_encoder_factory", + ":video_encoder_interface", + "../../api/video:video_frame", + "../../api/video_codecs:video_codecs_api", + "../../common_video:common_video", + "../../modules/video_coding/codecs/av1:dav1d_decoder", + "../../rtc_base:logging", + "../../test:fileutils", + "../../test:test_support", + "../../test:video_test_support", + ] +} + rtc_library("vp8_temporal_layers_factory") { visibility = [ "*" ] allow_poison = [ "software_video_codecs" ] diff --git a/api/video_codecs/libaom_av1_encoder_factory.cc 
b/api/video_codecs/libaom_av1_encoder_factory.cc
new file mode 100644
index 0000000000..eab6eaefe8
--- /dev/null
+++ b/api/video_codecs/libaom_av1_encoder_factory.cc
@@ -0,0 +1,842 @@
+/*
+ *  Copyright (c) 2024 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "api/video_codecs/libaom_av1_encoder_factory.h"
+
+// NOTE(review): the five standard header names were lost when this patch was
+// extracted; they are restored here from the std:: names used below. Confirm
+// against the upstream change.
+#include <array>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/algorithm/container.h"
+#include "api/video_codecs/video_encoder_interface.h"
+#include "rtc_base/logging.h"
+#include "third_party/libaom/source/libaom/aom/aom_codec.h"
+#include "third_party/libaom/source/libaom/aom/aom_encoder.h"
+#include "third_party/libaom/source/libaom/aom/aomcx.h"
+
+// Sets a libaom control on `ctx_`. On failure invokes the in-scope
+// `encode_result_callback` with an empty result and returns from the enclosing
+// void function.
+#define SET_OR_DO_ERROR_CALLBACK_AND_RETURN(param_id, param_value)    \
+  do {                                                                \
+    if (!SetEncoderControlParameters(&ctx_, param_id, param_value)) { \
+      encode_result_callback({});                                     \
+      return;                                                         \
+    }                                                                 \
+  } while (0)
+
+// Sets a libaom control on `ctx_`. On failure returns false from the enclosing
+// function.
+#define SET_OR_RETURN_FALSE(param_id, param_value)                    \
+  do {                                                                \
+    if (!SetEncoderControlParameters(&ctx_, param_id, param_value)) { \
+      return false;                                                   \
+    }                                                                 \
+  } while (0)
+
+namespace webrtc {
+
+using Cbr = VideoEncoderInterface::FrameEncodeSettings::Cbr;
+using Cqp = VideoEncoderInterface::FrameEncodeSettings::Cqp;
+using aom_img_ptr = std::unique_ptr<aom_image_t, decltype(&aom_img_free)>;
+
+namespace {
+// MaxQp defined here:
+// http://google3/third_party/libaom/git_root/av1/av1_cx_iface.c;l=3510;rcl=527067478
+constexpr int kMaxQp = 63;
+constexpr int kNumBuffers = 8;
+constexpr int kMaxReferences = 3;
+constexpr int kMinEffortLevel = -2;
+constexpr int kMaxEffortLevel = 2;
+constexpr int kMaxSpatialLayersWtf = 4;
+constexpr int kMaxTemporalLayers = 4;
+constexpr int kRtpTicksPerSecond = 90000;
+// Timestamps and durations are handed to libaom in the 90 kHz timebase that
+// `InitEncode` configures; inputs are in milliseconds.
+constexpr int kRtpTicksPerMs = kRtpTicksPerSecond / 1000;
+constexpr std::array<VideoFrameBuffer::Type, 2> kSupportedInputFormats = {
+    VideoFrameBuffer::Type::kI420, VideoFrameBuffer::Type::kNV12};
+
+constexpr std::array<Rational, 7> kSupportedScalingFactors = {
+    {{8, 1}, {4, 1}, {2, 1}, {1, 1}, {1, 2}, {1, 4}, {1, 8}}};
+
+// Returns the entry of `kSupportedScalingFactors` that maps `from` onto `to`
+// (using integer arithmetic on both dimensions), or nullopt if none does.
+absl::optional<Rational> GetScalingFactor(const Resolution& from,
+                                          const Resolution& to) {
+  auto it = absl::c_find_if(kSupportedScalingFactors, [&](const Rational& r) {
+    return (from.width * r.numerator / r.denominator) == to.width &&
+           (from.height * r.numerator / r.denominator) == to.height;
+  });
+
+  if (it != kSupportedScalingFactors.end()) {
+    return *it;
+  }
+
+  return {};
+}
+
+// `VideoEncoderInterface` implementation backed by the libaom AV1 encoder.
+// `InitEncode` must succeed before `Encode` is called.
+class LibaomAv1Encoder : public VideoEncoderInterface {
+ public:
+  LibaomAv1Encoder() = default;
+  ~LibaomAv1Encoder() override;
+
+  bool InitEncode(
+      const VideoEncoderFactoryInterface::StaticEncoderSettings& settings,
+      const std::map<std::string, std::string>& encoder_specific_settings);
+
+  void Encode(rtc::scoped_refptr<webrtc::VideoFrameBuffer> frame_buffer,
+              const TemporalUnitSettings& tu_settings,
+              const std::vector<FrameEncodeSettings>& frame_settings,
+              EncodeResultCallback encode_result_callback) override;
+
+ private:
+  aom_img_ptr image_to_encode_ = aom_img_ptr(nullptr, aom_img_free);
+  // Zero-initialized so that the destructor's `aom_codec_destroy` call is
+  // well defined even when `InitEncode` was never called or failed before
+  // `aom_codec_enc_init` ran.
+  aom_codec_ctx_t ctx_ = {};
+  aom_codec_enc_cfg_t cfg_ = {};
+
+  absl::optional<VideoCodecMode> current_content_type_;
+  absl::optional<int> current_effort_level_;
+  int max_number_of_threads_ = 0;
+  // Resolution of the frame most recently stored in each reference buffer.
+  std::array<absl::optional<Resolution>, kNumBuffers>
+      last_resolution_in_buffer_;
+};
+
+// Thin wrapper around `aom_codec_control` that logs a warning on failure.
+// Returns true iff libaom reported AOM_CODEC_OK.
+template <typename T>
+bool SetEncoderControlParameters(aom_codec_ctx_t* ctx, int id, T value) {
+  aom_codec_err_t error_code = aom_codec_control(ctx, id, value);
+  if (error_code != AOM_CODEC_OK) {
+    RTC_LOG(LS_WARNING) << "aom_codec_control returned " << error_code
+                        << " with id: " << id << ".";
+  }
+  return error_code == AOM_CODEC_OK;
+}
+
+LibaomAv1Encoder::~LibaomAv1Encoder() {
+  aom_codec_destroy(&ctx_);
+}
+
+// Configures and initializes the libaom context from `settings`. Returns
+// false if any encoder specific settings are supplied or if any libaom call
+// fails.
+bool LibaomAv1Encoder::InitEncode(
+    const VideoEncoderFactoryInterface::StaticEncoderSettings& settings,
+    const std::map<std::string, std::string>& encoder_specific_settings) {
+  if (!encoder_specific_settings.empty()) {
+    RTC_LOG(LS_ERROR)
+        << "libaom av1 encoder accepts no encoder specific settings";
+    return false;
+  }
+
+  if (aom_codec_err_t ret = aom_codec_enc_config_default(
+          aom_codec_av1_cx(), &cfg_, AOM_USAGE_REALTIME);
+      ret != AOM_CODEC_OK) {
+    RTC_LOG(LS_ERROR) << "aom_codec_enc_config_default returned " << ret;
+    return false;
+  }
+
+  max_number_of_threads_ = settings.max_number_of_threads;
+
+  // The encode resolution is set dynamically for each call to `Encode`, but for
+  // `aom_codec_enc_init` to not fail we set it here as well.
+  cfg_.g_w = settings.max_encode_dimensions.width;
+  cfg_.g_h = settings.max_encode_dimensions.height;
+  cfg_.g_timebase.num = 1;
+  // TD: does 90khz timebase make sense, use microseconds instead maybe?
+  cfg_.g_timebase.den = kRtpTicksPerSecond;
+  cfg_.g_input_bit_depth = settings.encoding_format.bit_depth;
+  cfg_.kf_mode = AOM_KF_DISABLED;
+  // TD: rc_undershoot_pct and rc_overshoot_pct should probably be removed.
+  cfg_.rc_undershoot_pct = 50;
+  cfg_.rc_overshoot_pct = 50;
+  auto* cbr =
+      absl::get_if<VideoEncoderFactoryInterface::StaticEncoderSettings::Cbr>(
+          &settings.rc_mode);
+  cfg_.rc_buf_initial_sz = cbr ? cbr->target_buffer_size.ms() : 600;
+  cfg_.rc_buf_optimal_sz = cbr ? cbr->target_buffer_size.ms() : 600;
+  cfg_.rc_buf_sz = cbr ? cbr->max_buffer_size.ms() : 1000;
+  cfg_.g_usage = AOM_USAGE_REALTIME;
+  cfg_.g_pass = AOM_RC_ONE_PASS;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_error_resilient = 0;
+  cfg_.rc_end_usage = cbr ? AOM_CBR : AOM_Q;
+
+  if (aom_codec_err_t ret =
+          aom_codec_enc_init(&ctx_, aom_codec_av1_cx(), &cfg_, /*flags=*/0);
+      ret != AOM_CODEC_OK) {
+    RTC_LOG(LS_ERROR) << "aom_codec_enc_init returned " << ret;
+    return false;
+  }
+
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_CDEF, 1);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_TPL_MODEL, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_DELTAQ_MODE, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_ORDER_HINT, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_AQ_MODE, 3);
+  SET_OR_RETURN_FALSE(AOME_SET_MAX_INTRA_BITRATE_PCT, 300);
+  SET_OR_RETURN_FALSE(AV1E_SET_COEFF_COST_UPD_FREQ, 3);
+  SET_OR_RETURN_FALSE(AV1E_SET_MODE_COST_UPD_FREQ, 3);
+  SET_OR_RETURN_FALSE(AV1E_SET_MV_COST_UPD_FREQ, 3);
+  SET_OR_RETURN_FALSE(AV1E_SET_ROW_MT, 1);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_OBMC, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_NOISE_SENSITIVITY, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_WARPED_MOTION, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_GLOBAL_MOTION, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_REF_FRAME_MVS, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_CFL_INTRA, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_SMOOTH_INTRA, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_ANGLE_DELTA, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_FILTER_INTRA, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_INTRA_DEFAULT_TX_ONLY, 1);
+  SET_OR_RETURN_FALSE(AV1E_SET_DISABLE_TRELLIS_QUANT, 1);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_DIST_WTD_COMP, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_DIFF_WTD_COMP, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_DUAL_FILTER, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_INTERINTRA_COMP, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_INTERINTRA_WEDGE, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_INTRA_EDGE_FILTER, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_INTRABC, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_MASKED_COMP, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_PAETH_INTRA, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_QM, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_RECT_PARTITIONS, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_RESTORATION, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_SMOOTH_INTERINTRA, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_ENABLE_TX64, 0);
+  SET_OR_RETURN_FALSE(AV1E_SET_MAX_REFERENCE_FRAMES, 3);
+
+  return true;
+}
+
+// Threading/tiling configuration chosen for a given input resolution. The
+// tile fields are passed to AV1E_SET_TILE_ROWS / AV1E_SET_TILE_COLUMNS.
+struct ThreadTilesAndSuperblockSizeInfo {
+  int num_threads;
+  int exp_tile_rows;
+  int exp_tile_colums;
+  aom_superblock_size_t superblock_size;
+};
+
+// Picks thread count, tile layout and superblock size from the pixel count of
+// the input and the thread budget given at init time.
+ThreadTilesAndSuperblockSizeInfo GetThreadingTilesAndSuperblockSize(
+    int width,
+    int height,
+    int max_number_of_threads) {
+  ThreadTilesAndSuperblockSizeInfo res;
+  const int num_pixels = width * height;
+  if (num_pixels >= 1920 * 1080 && max_number_of_threads > 8) {
+    res.num_threads = 8;
+    res.exp_tile_rows = 2;
+    res.exp_tile_colums = 1;
+  } else if (num_pixels >= 640 * 360 && max_number_of_threads > 4) {
+    res.num_threads = 4;
+    res.exp_tile_rows = 1;
+    res.exp_tile_colums = 1;
+  } else if (num_pixels >= 320 * 180 && max_number_of_threads > 2) {
+    res.num_threads = 2;
+    res.exp_tile_rows = 1;
+    res.exp_tile_colums = 0;
+  } else {
+    res.num_threads = 1;
+    res.exp_tile_rows = 0;
+    res.exp_tile_colums = 0;
+  }
+
+  if (res.num_threads > 4 && num_pixels >= 960 * 540) {
+    res.superblock_size = AOM_SUPERBLOCK_SIZE_64X64;
+  } else {
+    res.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC;
+  }
+
+  RTC_LOG(LS_WARNING) << __FUNCTION__ << " res.num_threads=" << res.num_threads
+                      << " res.exp_tile_rows=" << res.exp_tile_rows
+                      << " res.exp_tile_colums=" << res.exp_tile_colums
+                      << " res.superblock_size=" << res.superblock_size;
+
+  return res;
+}
+
+// Checks that one `Encode` call is well formed before any libaom state is
+// touched. Returns false (after logging the reason) if the input format,
+// effort level, layer ids, buffer ids, reference structure, scaling factors,
+// rate options or spatial layer ordering are not acceptable.
+bool ValidateEncodeParams(
+    const webrtc::VideoFrameBuffer& frame_buffer,
+    const VideoEncoderInterface::TemporalUnitSettings& tu_settings,
+    const std::vector<VideoEncoderInterface::FrameEncodeSettings>&
+        frame_settings,
+    const std::array<absl::optional<Resolution>, kNumBuffers>&
+        last_resolution_in_buffer,
+    aom_rc_mode rc_mode) {
+  if (frame_settings.empty()) {
+    RTC_LOG(LS_ERROR) << "No frame settings provided.";
+    return false;
+  }
+
+  // `PrepareInputImage` hits RTC_CHECK_NOTREACHED for any other buffer type,
+  // so reject those here and report an error instead of crashing.
+  if (!absl::c_linear_search(kSupportedInputFormats, frame_buffer.type())) {
+    RTC_LOG(LS_ERROR) << "Unsupported input format.";
+    return false;
+  }
+
+  auto in_range = [](int low, int high, int val) {
+    return low <= val && val < high;
+  };
+
+  if (!in_range(kMinEffortLevel, kMaxEffortLevel + 1,
+                tu_settings.effort_level)) {
+    RTC_LOG(LS_ERROR) << "Unsupported effort level "
+                      << tu_settings.effort_level;
+    return false;
+  }
+
+  for (size_t i = 0; i < frame_settings.size(); ++i) {
+    const VideoEncoderInterface::FrameEncodeSettings& settings =
+        frame_settings[i];
+
+    if (!in_range(0, kMaxSpatialLayersWtf, settings.spatial_id)) {
+      RTC_LOG(LS_ERROR) << "invalid spatial id " << settings.spatial_id;
+      return false;
+    }
+
+    if (!in_range(0, kMaxTemporalLayers, settings.temporal_id)) {
+      RTC_LOG(LS_ERROR) << "invalid temporal id " << settings.temporal_id;
+      return false;
+    }
+
+    // `GetSvcParams` RTC_CHECKs that the input can be scaled to every
+    // requested frame resolution, so verify that up front.
+    if (!GetScalingFactor({frame_buffer.width(), frame_buffer.height()},
+                          settings.resolution)) {
+      RTC_LOG(LS_ERROR) << "Input resolution scaling factor not supported.";
+      return false;
+    }
+
+    if ((settings.frame_type == FrameType::kKeyframe ||
+         settings.frame_type == FrameType::kStartFrame) &&
+        !settings.reference_buffers.empty()) {
+      RTC_LOG(LS_ERROR) << "Reference buffers can not be used for keyframes.";
+      return false;
+    }
+
+    if ((settings.frame_type == FrameType::kKeyframe ||
+         settings.frame_type == FrameType::kStartFrame) &&
+        !settings.update_buffer) {
+      RTC_LOG(LS_ERROR)
+          << "Buffer to update must be specified for keyframe/startframe";
+      return false;
+    }
+
+    if (settings.update_buffer &&
+        !in_range(0, kNumBuffers, *settings.update_buffer)) {
+      RTC_LOG(LS_ERROR) << "Invalid update buffer id.";
+      return false;
+    }
+
+    if (settings.reference_buffers.size() > kMaxReferences) {
+      RTC_LOG(LS_ERROR) << "Too many referenced buffers.";
+      return false;
+    }
+
+    for (size_t j = 0; j < settings.reference_buffers.size(); ++j) {
+      if (!in_range(0, kNumBuffers, settings.reference_buffers[j])) {
+        RTC_LOG(LS_ERROR) << "Invalid reference buffer id.";
+        return false;
+      }
+
+      // Figure out which frame resolution a certain buffer will hold when the
+      // frame described by `settings` is encoded.
+      absl::optional<Resolution> referenced_resolution;
+      bool keyframe_on_previous_layer = false;
+
+      // Will some other frame in this temporal unit update the buffer?
+      for (size_t k = 0; k < i; ++k) {
+        if (frame_settings[k].frame_type == FrameType::kKeyframe) {
+          keyframe_on_previous_layer = true;
+          referenced_resolution.reset();
+        }
+        if (frame_settings[k].update_buffer == settings.reference_buffers[j]) {
+          referenced_resolution = frame_settings[k].resolution;
+        }
+      }
+
+      // Not updated by another frame in the temporal unit, what is the
+      // resolution of the last frame stored into that buffer?
+      if (!referenced_resolution && !keyframe_on_previous_layer) {
+        referenced_resolution =
+            last_resolution_in_buffer[settings.reference_buffers[j]];
+      }
+
+      if (!referenced_resolution) {
+        RTC_LOG(LS_ERROR) << "Referenced buffer holds no frame.";
+        return false;
+      }
+
+      if (!GetScalingFactor(*referenced_resolution, settings.resolution)) {
+        RTC_LOG(LS_ERROR)
+            << "Required resolution scaling factor not supported.";
+        return false;
+      }
+
+      // Compare reference `j` against every later reference of the same
+      // frame. This must be driven by the reference index `j`, not by the
+      // frame index `i`.
+      for (size_t l = j + 1; l < settings.reference_buffers.size(); ++l) {
+        if (settings.reference_buffers[j] == settings.reference_buffers[l]) {
+          RTC_LOG(LS_ERROR) << "Duplicate reference buffer specified.";
+          return false;
+        }
+      }
+    }
+
+    if ((rc_mode == AOM_CBR &&
+         absl::holds_alternative<Cqp>(settings.rate_options)) ||
+        (rc_mode == AOM_Q &&
+         absl::holds_alternative<Cbr>(settings.rate_options))) {
+      RTC_LOG(LS_ERROR) << "Invalid rate options, encoder configured with "
+                        << (rc_mode == AOM_CBR ? "AOM_CBR" : "AOM_Q");
+      return false;
+    }
+
+    // Spatial ids must be strictly increasing across `frame_settings`.
+    for (size_t j = i + 1; j < frame_settings.size(); ++j) {
+      if (settings.spatial_id >= frame_settings[j].spatial_id) {
+        RTC_LOG(LS_ERROR) << "Frame spatial id specified out of order.";
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+// Points `out_aom_image` at the pixel data of `input_buffer` without copying.
+// The wrapper image is (re)created only when format or dimensions change.
+// `input_buffer` must be I420 or NV12.
+void PrepareInputImage(const VideoFrameBuffer& input_buffer,
+                       aom_img_ptr& out_aom_image) {
+  aom_img_fmt_t input_format;
+  switch (input_buffer.type()) {
+    case VideoFrameBuffer::Type::kI420:
+      input_format = AOM_IMG_FMT_I420;
+      break;
+    case VideoFrameBuffer::Type::kNV12:
+      input_format = AOM_IMG_FMT_NV12;
+      break;
+    default:
+      RTC_CHECK_NOTREACHED();
+      return;
+  }
+
+  if (!out_aom_image || out_aom_image->fmt != input_format ||
+      static_cast<int>(out_aom_image->w) != input_buffer.width() ||
+      static_cast<int>(out_aom_image->h) != input_buffer.height()) {
+    out_aom_image.reset(
+        aom_img_wrap(/*img=*/nullptr, input_format, input_buffer.width(),
+                     input_buffer.height(), /*align=*/1, /*img_data=*/nullptr));
+
+    RTC_LOG(LS_WARNING) << __FUNCTION__ << " input_format=" << input_format
+                        << " input_buffer.width()=" << input_buffer.width()
+                        << " input_buffer.height()=" << input_buffer.height()
+                        << " w=" << out_aom_image->w
+                        << " h=" << out_aom_image->h
+                        << " d_w=" << out_aom_image->d_w
+                        << " d_h=" << out_aom_image->d_h
+                        << " r_w=" << out_aom_image->r_w
+                        << " r_h=" << out_aom_image->r_h;
+  }
+
+  if (input_format == AOM_IMG_FMT_I420) {
+    const I420BufferInterface* i420_buffer = input_buffer.GetI420();
+    RTC_DCHECK(i420_buffer);
+    out_aom_image->planes[AOM_PLANE_Y] =
+        const_cast<unsigned char*>(i420_buffer->DataY());
+    out_aom_image->planes[AOM_PLANE_U] =
+        const_cast<unsigned char*>(i420_buffer->DataU());
+    out_aom_image->planes[AOM_PLANE_V] =
+        const_cast<unsigned char*>(i420_buffer->DataV());
+    out_aom_image->stride[AOM_PLANE_Y] = i420_buffer->StrideY();
+    out_aom_image->stride[AOM_PLANE_U] = i420_buffer->StrideU();
+    out_aom_image->stride[AOM_PLANE_V] = i420_buffer->StrideV();
+  } else {
+    const NV12BufferInterface* nv12_buffer = input_buffer.GetNV12();
+    RTC_DCHECK(nv12_buffer);
+    out_aom_image->planes[AOM_PLANE_Y] =
+        const_cast<unsigned char*>(nv12_buffer->DataY());
+    out_aom_image->planes[AOM_PLANE_U] =
+        const_cast<unsigned char*>(nv12_buffer->DataUV());
+    out_aom_image->planes[AOM_PLANE_V] = nullptr;
+    out_aom_image->stride[AOM_PLANE_Y] = nv12_buffer->StrideY();
+    out_aom_image->stride[AOM_PLANE_U] = nv12_buffer->StrideUV();
+    out_aom_image->stride[AOM_PLANE_V] = 0;
+  }
+}
+
+// Translates the reference/update buffers of `settings` into the libaom
+// reference frame configuration for one layer frame.
+aom_svc_ref_frame_config_t GetSvcRefFrameConfig(
+    const VideoEncoderInterface::FrameEncodeSettings& settings) {
+  // Buffer alias to use for each position. In particular when there are two
+  // buffers being used, prefer to alias them as LAST and GOLDEN, since the AV1
+  // bitstream format has dedicated fields for them. See last_frame_idx and
+  // golden_frame_idx in the av1 spec
+  // https://aomediacodec.github.io/av1-spec/av1-spec.pdf.
+
+  // Libaom is also compiled for RTC, which limits the number of references to
+  // at most three, and they must be aliased as LAST, GOLDEN and ALTREF. Also
+  // note that libaom favors LAST the most, and GOLDEN second most, so buffers
+  // should be specified in order of how useful they are for prediction. Libaom
+  // could be updated to make LAST, GOLDEN and ALTREF equivalent, but that is
+  // not a priority for now. All aliases can be used to update buffers.
+  // TD: Automatically select LAST, GOLDEN and ALTREF depending on previous
+  // buffer usage.
+  static constexpr int kPreferedAlias[] = {0,  // LAST
+                                           3,  // GOLDEN
+                                           6,  // ALTREF
+                                           1, 2, 4, 5};
+
+  aom_svc_ref_frame_config_t ref_frame_config = {};
+
+  int alias_index = 0;
+  if (!settings.reference_buffers.empty()) {
+    for (size_t i = 0; i < settings.reference_buffers.size(); ++i) {
+      ref_frame_config.ref_idx[kPreferedAlias[alias_index]] =
+          settings.reference_buffers[i];
+      ref_frame_config.reference[kPreferedAlias[alias_index]] = 1;
+      alias_index++;
+    }
+
+    // Delta frames must not alias unused buffers, and since start frames only
+    // update some buffers it is not safe to leave unused aliases to simply
+    // point to buffer 0.
+    for (size_t i = settings.reference_buffers.size();
+         i < std::size(ref_frame_config.ref_idx); ++i) {
+      ref_frame_config.ref_idx[kPreferedAlias[i]] =
+          settings.reference_buffers.back();
+    }
+  }
+
+  if (settings.update_buffer) {
+    if (!absl::c_linear_search(settings.reference_buffers,
+                               *settings.update_buffer)) {
+      ref_frame_config.ref_idx[kPreferedAlias[alias_index]] =
+          *settings.update_buffer;
+      alias_index++;
+    }
+    ref_frame_config.refresh[*settings.update_buffer] = 1;
+  }
+
+  char buf[256];
+  rtc::SimpleStringBuilder sb(buf);
+  sb << " spatial_id=" << settings.spatial_id;
+  sb << "  ref_idx=[ ";
+  for (auto r : ref_frame_config.ref_idx) {
+    sb << r << " ";
+  }
+  sb << "]  reference=[ ";
+  for (auto r : ref_frame_config.reference) {
+    sb << r << " ";
+  }
+  sb << "]  refresh=[ ";
+  for (auto r : ref_frame_config.refresh) {
+    sb << r << " ";
+  }
+  sb << "]";
+
+  RTC_LOG(LS_WARNING) << __FUNCTION__ << sb.str();
+
+  return ref_frame_config;
+}
+
+// Builds the libaom SVC parameters (layer counts, per-layer scaling factors,
+// target bitrates and quantizer limits) for one temporal unit.
+// `frame_settings` must be non-empty and every entry must have a supported
+// scaling factor relative to `frame_buffer`.
+aom_svc_params_t GetSvcParams(
+    const webrtc::VideoFrameBuffer& frame_buffer,
+    const std::vector<VideoEncoderInterface::FrameEncodeSettings>&
+        frame_settings) {
+  aom_svc_params_t svc_params = {};
+  svc_params.number_spatial_layers = frame_settings.back().spatial_id + 1;
+  svc_params.number_temporal_layers = kMaxTemporalLayers;
+
+  // TD: What about svc_params.framerate_factor?
+  // If `framerate_factors` are left at 0 then configured bitrate values will
+  // not be picked up by libaom.
+  for (int tid = 0; tid < svc_params.number_temporal_layers; ++tid) {
+    svc_params.framerate_factor[tid] = 1;
+  }
+
+  // If the scaling factor is left at zero for unused layers a division by zero
+  // will happen inside libaom, default all layers to one.
+  for (int sid = 0; sid < svc_params.number_spatial_layers; ++sid) {
+    svc_params.scaling_factor_num[sid] = 1;
+    svc_params.scaling_factor_den[sid] = 1;
+  }
+
+  for (const VideoEncoderInterface::FrameEncodeSettings& settings :
+       frame_settings) {
+    absl::optional<Rational> scaling_factor = GetScalingFactor(
+        {frame_buffer.width(), frame_buffer.height()}, settings.resolution);
+    RTC_CHECK(scaling_factor);
+    svc_params.scaling_factor_num[settings.spatial_id] =
+        scaling_factor->numerator;
+    svc_params.scaling_factor_den[settings.spatial_id] =
+        scaling_factor->denominator;
+
+    const int flat_layer_id =
+        settings.spatial_id * svc_params.number_temporal_layers +
+        settings.temporal_id;
+
+    RTC_LOG(LS_WARNING) << __FUNCTION__ << " flat_layer_id=" << flat_layer_id
+                        << " num="
+                        << svc_params.scaling_factor_num[settings.spatial_id]
+                        << " den="
+                        << svc_params.scaling_factor_den[settings.spatial_id];
+
+    absl::visit(
+        [&](auto&& arg) {
+          using T = std::decay_t<decltype(arg)>;
+          if constexpr (std::is_same_v<T, Cbr>) {
+            // Libaom calculates the total bitrate across all spatial layers by
+            // summing the bitrate of the last temporal layer in each spatial
+            // layer. This means the bitrate for the top temporal layer always
+            // has to be set even if that temporal layer is not being encoded.
+            const int last_temporal_layer_in_spatial_layer_id =
+                settings.spatial_id * svc_params.number_temporal_layers +
+                (kMaxTemporalLayers - 1);
+            svc_params
+                .layer_target_bitrate[last_temporal_layer_in_spatial_layer_id] =
+                arg.target_bitrate.kbps();
+
+            svc_params.layer_target_bitrate[flat_layer_id] =
+                arg.target_bitrate.kbps();
+            // When libaom is configured with `AOM_CBR` it will still limit QP
+            // to stay between `min_quantizers` and `max_quantizers'. Set
+            // `max_quantizers` to max QP to avoid the encoder overshooting.
+            svc_params.max_quantizers[flat_layer_id] = kMaxQp;
+            svc_params.min_quantizers[flat_layer_id] = 0;
+          } else if constexpr (std::is_same_v<T, Cqp>) {
+            // When libaom is configured with `AOM_Q` it will still look at the
+            // `layer_target_bitrate` to determine whether the layer is disabled
+            // or not. Set `layer_target_bitrate` to 1 so that libaom knows the
+            // layer is active.
+            svc_params.layer_target_bitrate[flat_layer_id] = 1;
+            svc_params.max_quantizers[flat_layer_id] = arg.target_qp;
+            svc_params.min_quantizers[flat_layer_id] = arg.target_qp;
+            RTC_LOG(LS_WARNING) << __FUNCTION__ << " svc_params.qp["
+                                << flat_layer_id << "]=" << arg.target_qp;
+            // TD: Does libaom look at both max and min? Shouldn't it just be
+            // one of them
+          }
+        },
+        settings.rate_options);
+  }
+
+  char buf[512];
+  rtc::SimpleStringBuilder sb(buf);
+  sb << "GetSvcParams" << " layer bitrates kbps";
+  for (int s = 0; s < svc_params.number_spatial_layers; ++s) {
+    sb << " S" << s << "=[ ";
+    for (int t = 0; t < svc_params.number_temporal_layers; ++t) {
+      int id = s * svc_params.number_temporal_layers + t;
+      sb << "T" << t << "=" << svc_params.layer_target_bitrate[id] << " ";
+    }
+    sb << "]";
+  }
+
+  RTC_LOG(LS_WARNING) << sb.str();
+
+  return svc_params;
+}
+
+// Encodes one temporal unit. `encode_result_callback` is invoked once per
+// successfully encoded layer frame; on the first failure it is invoked with an
+// empty result and encoding of the temporal unit stops.
+void LibaomAv1Encoder::Encode(
+    rtc::scoped_refptr<webrtc::VideoFrameBuffer> frame_buffer,
+    const TemporalUnitSettings& tu_settings,
+    const std::vector<FrameEncodeSettings>& frame_settings,
+    EncodeResultCallback encode_result_callback) {
+  // `frame_buffer` is dereferenced below; treat a missing buffer as an
+  // ordinary encode error.
+  if (frame_buffer == nullptr) {
+    RTC_LOG(LS_ERROR) << "No frame buffer provided.";
+    encode_result_callback({});
+    return;
+  }
+
+  if (!ValidateEncodeParams(*frame_buffer, tu_settings, frame_settings,
+                            last_resolution_in_buffer_, cfg_.rc_end_usage)) {
+    encode_result_callback({});
+    return;
+  }
+
+  if (tu_settings.effort_level != current_effort_level_) {
+    // For RTC we use speed level 6 to 10, with 8 being the default. Note that
+    // low effort means higher speed.
+    SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AOME_SET_CPUUSED,
+                                        8 - tu_settings.effort_level);
+    current_effort_level_ = tu_settings.effort_level;
+  }
+
+  if (current_content_type_ != tu_settings.content_hint) {
+    if (tu_settings.content_hint == VideoCodecMode::kScreensharing) {
+      // TD: Set speed 11?
+      SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_TUNE_CONTENT,
+                                          AOM_CONTENT_SCREEN);
+      SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_ENABLE_PALETTE, 1);
+    } else {
+      SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_TUNE_CONTENT,
+                                          AOM_CONTENT_DEFAULT);
+      SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_ENABLE_PALETTE, 0);
+    }
+    current_content_type_ = tu_settings.content_hint;
+  }
+
+  if (cfg_.rc_end_usage == AOM_CBR) {
+    DataRate accum_rate = DataRate::Zero();
+    for (const FrameEncodeSettings& settings : frame_settings) {
+      accum_rate += absl::get<Cbr>(settings.rate_options).target_bitrate;
+    }
+    cfg_.rc_target_bitrate = accum_rate.kbps();
+    RTC_LOG(LS_WARNING) << __FUNCTION__
+                        << " cfg_.rc_target_bitrate=" << cfg_.rc_target_bitrate;
+  }
+
+  // NOTE(review): `InitEncode` seeds `cfg_.g_w`/`cfg_.g_h` with the maximum
+  // encode dimensions, so when the first input frame already has that size
+  // this branch is skipped and the thread/tile/superblock settings below are
+  // never applied. Confirm whether that is intended.
+  if (static_cast<int>(cfg_.g_w) != frame_buffer->width() ||
+      static_cast<int>(cfg_.g_h) != frame_buffer->height()) {
+    RTC_LOG(LS_WARNING) << __FUNCTION__ << " resolution changed from "
+                        << cfg_.g_w << "x" << cfg_.g_h << " to "
+                        << frame_buffer->width() << "x"
+                        << frame_buffer->height();
+    ThreadTilesAndSuperblockSizeInfo ttsbi = GetThreadingTilesAndSuperblockSize(
+        frame_buffer->width(), frame_buffer->height(), max_number_of_threads_);
+    SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_SUPERBLOCK_SIZE,
+                                        ttsbi.superblock_size);
+    SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_TILE_ROWS,
+                                        ttsbi.exp_tile_rows);
+    SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_TILE_COLUMNS,
+                                        ttsbi.exp_tile_colums);
+    cfg_.g_threads = ttsbi.num_threads;
+    cfg_.g_w = frame_buffer->width();
+    cfg_.g_h = frame_buffer->height();
+  }
+
+  PrepareInputImage(*frame_buffer, image_to_encode_);
+
+  // The bitrates calculated internally in libaom when `AV1E_SET_SVC_PARAMS` is
+  // called depends on the currently configured `cfg_.rc_target_bitrate`. If the
+  // total target bitrate is not updated first a division by zero could happen.
+  if (aom_codec_err_t ret = aom_codec_enc_config_set(&ctx_, &cfg_);
+      ret != AOM_CODEC_OK) {
+    RTC_LOG(LS_ERROR) << "aom_codec_enc_config_set returned " << ret;
+    encode_result_callback({});
+    return;
+  }
+  aom_svc_params_t svc_params = GetSvcParams(*frame_buffer, frame_settings);
+  SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_SVC_PARAMS, &svc_params);
+
+  // The libaom AV1 encoder requires that `aom_codec_encode` is called for
+  // every spatial layer, even if no frame should be encoded for that layer.
+  std::array<const FrameEncodeSettings*, kMaxSpatialLayersWtf>
+      settings_for_spatial_id;
+  settings_for_spatial_id.fill(nullptr);
+  FrameEncodeSettings settings_for_unused_layer;
+  for (const FrameEncodeSettings& settings : frame_settings) {
+    settings_for_spatial_id[settings.spatial_id] = &settings;
+  }
+
+  for (int sid = frame_settings[0].spatial_id;
+       sid < svc_params.number_spatial_layers; ++sid) {
+    const bool layer_enabled = settings_for_spatial_id[sid] != nullptr;
+    const FrameEncodeSettings& settings = layer_enabled
+                                              ? *settings_for_spatial_id[sid]
+                                              : settings_for_unused_layer;
+
+    aom_svc_layer_id_t layer_id = {
+        .spatial_layer_id = sid,
+        .temporal_layer_id = settings.temporal_id,
+    };
+    SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_SVC_LAYER_ID, &layer_id);
+    aom_svc_ref_frame_config_t ref_config = GetSvcRefFrameConfig(settings);
+    SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AV1E_SET_SVC_REF_FRAME_CONFIG,
+                                        &ref_config);
+
+    // TD: Why does the libaom have both `encode_timestamp_` and `duration`?
+    // TD: Duration can't be zero, what does it matter when the layer is
+    // not being encoded?
+    TimeDelta duration = TimeDelta::Millis(1);
+    if (layer_enabled) {
+      if (const Cbr* cbr = absl::get_if<Cbr>(&settings.rate_options)) {
+        duration = cbr->duration;
+      } else {
+        // TD: What should duration be when Cqp is used?
+        duration = TimeDelta::Millis(1);
+      }
+    }
+
+    // Compute the values once so that the log line and the actual encode call
+    // cannot drift apart.
+    const auto rtp_timestamp =
+        tu_settings.presentation_timestamp.ms() * kRtpTicksPerMs;
+    const auto rtp_duration = duration.ms() * kRtpTicksPerMs;
+
+    RTC_LOG(LS_WARNING)
+        << __FUNCTION__ << " timestamp=" << rtp_timestamp
+        << " duration=" << rtp_duration << " type="
+        << (settings.frame_type == FrameType::kKeyframe ? "key" : "delta");
+    aom_codec_err_t ret = aom_codec_encode(
+        &ctx_, &*image_to_encode_, rtp_timestamp, rtp_duration,
+        settings.frame_type == FrameType::kKeyframe ? AOM_EFLAG_FORCE_KF : 0);
+    if (ret != AOM_CODEC_OK) {
+      RTC_LOG(LS_WARNING) << "aom_codec_encode returned " << ret;
+      encode_result_callback({});
+      return;
+    }
+
+    if (!layer_enabled) {
+      continue;
+    }
+
+    if (settings.frame_type == FrameType::kKeyframe) {
+      last_resolution_in_buffer_ = {};
+    }
+
+    if (settings.update_buffer) {
+      last_resolution_in_buffer_[*settings.update_buffer] = settings.resolution;
+    }
+
+    EncodedData result;
+    aom_codec_iter_t iter = nullptr;
+    while (const aom_codec_cx_pkt_t* pkt =
+               aom_codec_get_cx_data(&ctx_, &iter)) {
+      if (pkt->kind == AOM_CODEC_CX_FRAME_PKT && pkt->data.frame.sz > 0) {
+        SET_OR_DO_ERROR_CALLBACK_AND_RETURN(AOME_GET_LAST_QUANTIZER_64,
+                                            &result.encoded_qp);
+        // `AOM_FRAME_IS_KEY` is the output packet flag; `AOM_EFLAG_FORCE_KF`
+        // is an input flag for `aom_codec_encode`.
+        result.frame_type = pkt->data.frame.flags & AOM_FRAME_IS_KEY
+                                ? FrameType::kKeyframe
+                                : FrameType::kDeltaFrame;
+        result.bitstream_data = EncodedImageBuffer::Create(
+            static_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
+        result.spatial_id = sid;
+        result.referenced_buffers = settings.reference_buffers;
+        break;
+      }
+    }
+
+    if (result.bitstream_data == nullptr) {
+      // TD: How should error callbacks be handled, only call once?
+      encode_result_callback({});
+      return;
+    } else {
+      encode_result_callback(result);
+    }
+  }
+}
+}  // namespace
+
+std::string LibaomAv1EncoderFactory::CodecName() const {
+  return "AV1";
+}
+
+// TD: it should also possible to expose SW/HW/driver version.
+std::string LibaomAv1EncoderFactory::ImplementationName() const { + return "Libaom"; +} + +std::map LibaomAv1EncoderFactory::CodecSpecifics() + const { + return {}; +} + +VideoEncoderFactoryInterface::Capabilities +LibaomAv1EncoderFactory::GetEncoderCapabilities() const { + return { + .prediction_constraints = + {.num_buffers = kNumBuffers, + .max_references = kMaxReferences, + .max_temporal_layers = kMaxTemporalLayers, + .buffer_space_type = VideoEncoderFactoryInterface::Capabilities:: + PredictionConstraints::BufferSpaceType::kSingleKeyframe, + .max_spatial_layers = kMaxSpatialLayersWtf, + .scaling_factors = {kSupportedScalingFactors.begin(), + kSupportedScalingFactors.end()}, + .supported_frame_types = {FrameType::kKeyframe, + FrameType::kStartFrame, + FrameType::kDeltaFrame}}, + .input_constraints = { + .min = {.width = 64, .height = 36}, + .max = {.width = 3840, .height = 2160}, + .pixel_alignment = 1, + .input_formats = {kSupportedInputFormats.begin(), + kSupportedInputFormats.end()}, + }, + .encoding_formats = {{.sub_sampling = EncodingFormat::k420, + .bit_depth = 8}}, + .rate_control = + {.qp_range = {0, kMaxQp}, + .rc_modes = {VideoEncoderFactoryInterface::RateControlMode::kCbr, + VideoEncoderFactoryInterface::RateControlMode::kCqp}}, + .performance = {.min_max_effort_level = {kMinEffortLevel, + kMaxEffortLevel}}, + }; +} + +std::unique_ptr LibaomAv1EncoderFactory::CreateEncoder( + const StaticEncoderSettings& settings, + const std::map& encoder_specific_settings) { + auto encoder = std::make_unique(); + if (!encoder->InitEncode(settings, encoder_specific_settings)) { + return nullptr; + } + return encoder; +} + +} // namespace webrtc diff --git a/api/video_codecs/libaom_av1_encoder_factory.h b/api/video_codecs/libaom_av1_encoder_factory.h new file mode 100644 index 0000000000..df2abb2a3b --- /dev/null +++ b/api/video_codecs/libaom_av1_encoder_factory.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2024 The WebRTC project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef API_VIDEO_CODECS_LIBAOM_AV1_ENCODER_FACTORY_H_ +#define API_VIDEO_CODECS_LIBAOM_AV1_ENCODER_FACTORY_H_ + +#include +#include +#include +#include + +#include "api/video_codecs/video_encoder_factory_interface.h" + +namespace webrtc { +class LibaomAv1EncoderFactory final : VideoEncoderFactoryInterface { + public: + std::string CodecName() const override; + std::string ImplementationName() const override; + std::map CodecSpecifics() const override; + + Capabilities GetEncoderCapabilities() const override; + std::unique_ptr CreateEncoder( + const StaticEncoderSettings& settings, + const std::map& encoder_specific_settings) + override; +}; +} // namespace webrtc +#endif // API_VIDEO_CODECS_LIBAOM_AV1_ENCODER_FACTORY_H_ diff --git a/api/video_codecs/libaom_av1_encoder_factory_test.cc b/api/video_codecs/libaom_av1_encoder_factory_test.cc new file mode 100644 index 0000000000..eacb5a7a85 --- /dev/null +++ b/api/video_codecs/libaom_av1_encoder_factory_test.cc @@ -0,0 +1,822 @@ +/* + * Copyright (c) 2024 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "api/video_codecs/libaom_av1_encoder_factory.h" + +#include +#include +#include + +#include "api/video/i420_buffer.h" +#include "api/video_codecs/video_decoder.h" +#include "api/video_codecs/video_encoder_interface.h" +#include "common_video/libyuv/include/webrtc_libyuv.h" +#include "modules/video_coding/codecs/av1/dav1d_decoder.h" +#include "rtc_base/logging.h" +#include "test/gmock.h" +#include "test/gtest.h" +#include "test/testsupport/file_utils.h" +#include "test/testsupport/frame_reader.h" +#include "test/testsupport/frame_writer.h" + +namespace webrtc { +namespace { +using ::testing::_; +using ::testing::ElementsAre; +using ::testing::Eq; +using ::testing::Field; +using ::testing::Gt; +using ::testing::IsEmpty; +using ::testing::Lt; +using ::testing::MockFunction; +using ::testing::NotNull; +using Cbr = VideoEncoderInterface::FrameEncodeSettings::Cbr; +using Cqp = VideoEncoderInterface::FrameEncodeSettings::Cqp; +using EncodedData = VideoEncoderInterface::EncodedData; +using EncodeResult = VideoEncoderInterface::EncodeResult; +using EncodeResultCallback = VideoEncoderInterface::EncodeResultCallback; +using FrameType = VideoEncoderInterface::FrameType; + +std::unique_ptr CreateFrameReader() { + return CreateY4mFrameReader( + test::ResourcePath("reference_video_640x360_30fps", "y4m"), + test::YuvFrameReaderImpl::RepeatMode::kPingPong); +} + +std::string OutPath() { + std::string res = test::OutputPath(); + res += "frame_dump/"; + RTC_CHECK(test::DirExists(res) || test::CreateDir(res)); + return res; +} + +class EncodeResults { + public: + EncodeResultCallback CallBack() { + return [&](const EncodeResult& result) { results_.push_back(result); }; + } + + EncodedData* FrameAt(int index) { + if (index < 0 || index > static_cast(results_.size())) { + RTC_CHECK(false); + return nullptr; + } + return std::get_if(&results_[index]); + } + + private: + std::vector results_; +}; + +class Av1Decoder : public DecodedImageCallback { + public: + Av1Decoder() 
: Av1Decoder("") {} + + explicit Av1Decoder(const std::string& name) + : decoder_(CreateDav1dDecoder()), file_name_(name) { + decoder_->Configure({}); + decoder_->RegisterDecodeCompleteCallback(this); + + if (!file_name_.empty()) { + std::string out = OutPath(); + out += file_name_; + out += "_raw.av1"; + RTC_CHECK(raw_out_file_ = fopen(out.c_str(), "wb")); + RTC_LOG(LS_INFO) << "Recording bitstream to " << out; + } + } + + ~Av1Decoder() { + if (raw_out_file_) { + fclose(raw_out_file_); + } + } + + // DecodedImageCallback + int32_t Decoded(VideoFrame& frame) override { + decode_result_ = std::make_unique(std::move(frame)); + return 0; + } + + VideoFrame Decode(const EncodedData& encoded_data) { + EncodedImage img; + img.SetEncodedData(encoded_data.bitstream_data); + if (raw_out_file_) { + fwrite(encoded_data.bitstream_data->data(), 1, + encoded_data.bitstream_data->size(), raw_out_file_); + } + decoder_->Decode(img, /*dont_care=*/0); + VideoFrame res(std::move(*decode_result_)); + return res; + } + + private: + std::unique_ptr decoder_; + std::unique_ptr decode_result_; + std::string file_name_; + FILE* raw_out_file_ = nullptr; +}; + +class FrameEncoderSettingsBuilder { + public: + FrameEncoderSettingsBuilder& Key() { + frame_encode_settings_.frame_type = FrameType::kKeyframe; + return *this; + } + + FrameEncoderSettingsBuilder& Start() { + frame_encode_settings_.frame_type = FrameType::kStartFrame; + return *this; + } + + FrameEncoderSettingsBuilder& Delta() { + frame_encode_settings_.frame_type = FrameType::kStartFrame; + return *this; + } + + FrameEncoderSettingsBuilder& Rate( + const absl::variant& rate_options) { + frame_encode_settings_.rate_options = rate_options; + return *this; + } + + FrameEncoderSettingsBuilder& T(int id) { + frame_encode_settings_.temporal_id = id; + return *this; + } + + FrameEncoderSettingsBuilder& S(int id) { + frame_encode_settings_.spatial_id = id; + return *this; + } + + FrameEncoderSettingsBuilder& Res(int width, int height) { + 
frame_encode_settings_.resolution = {width, height}; + return *this; + } + + FrameEncoderSettingsBuilder& Ref(const std::vector& ref) { + frame_encode_settings_.reference_buffers = ref; + return *this; + } + + FrameEncoderSettingsBuilder& Upd(int upd) { + frame_encode_settings_.update_buffer = upd; + return *this; + } + + VideoEncoderInterface::FrameEncodeSettings Build() { + return frame_encode_settings_; + } + + private: + VideoEncoderInterface::FrameEncodeSettings frame_encode_settings_; +}; + +using Fb = FrameEncoderSettingsBuilder; + +// For reasonable debug printout when an EXPECT fail. +struct Resolution { + explicit Resolution(const VideoFrame& frame) + : width(frame.width()), height(frame.height()) {} + + friend void PrintTo(const Resolution& res, std::ostream* os) { + *os << "(width: " << res.width << " height: " << res.height << ")"; + } + + int width; + int height; +}; + +MATCHER_P2(ResolutionIs, width, height, "") { + return arg.width == width && arg.height == height; +} + +double Psnr(const rtc::scoped_refptr& ref_buffer, + const VideoFrame& decoded_frame) { + return I420PSNR(*ref_buffer, *decoded_frame.video_frame_buffer()->ToI420()); +} + +static constexpr VideoEncoderFactoryInterface::StaticEncoderSettings + kCbrEncoderSettings{ + .max_encode_dimensions = {.width = 1920, .height = 1080}, + .encoding_format = {.sub_sampling = EncodingFormat::SubSampling::k420, + .bit_depth = 8}, + .rc_mode = + VideoEncoderFactoryInterface::StaticEncoderSettings::Cbr{ + .max_buffer_size = TimeDelta::Millis(1000), + .target_buffer_size = TimeDelta::Millis(600)}, + .max_number_of_threads = 1, + }; + +static constexpr VideoEncoderFactoryInterface::StaticEncoderSettings + kCqpEncoderSettings{ + .max_encode_dimensions = {.width = 1920, .height = 1080}, + .encoding_format = {.sub_sampling = EncodingFormat::SubSampling::k420, + .bit_depth = 8}, + .rc_mode = VideoEncoderFactoryInterface::StaticEncoderSettings::Cqp(), + .max_number_of_threads = 1, + }; + +static constexpr Cbr 
kCbr{.duration = TimeDelta::Millis(100), + .target_bitrate = DataRate::KilobitsPerSec(1000)}; + +TEST(LibaomAv1EncoderFactory, CodecName) { + EXPECT_THAT(LibaomAv1EncoderFactory().CodecName(), Eq("AV1")); +} + +TEST(LibaomAv1EncoderFactory, CodecSpecifics) { + EXPECT_THAT(LibaomAv1EncoderFactory().CodecSpecifics(), IsEmpty()); +} + +TEST(LibaomAv1EncoderFactory, QpRange) { + const std::pair kMinMaxQp = {0, 63}; + EXPECT_THAT( + LibaomAv1EncoderFactory().GetEncoderCapabilities().rate_control.qp_range, + Eq(kMinMaxQp)); +} + +TEST(LibaomAv1Encoder, KeyframeUpdatesSpecifiedBuffer) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + Av1Decoder dec; + + auto raw_key = frame_reader->PullFrame(); + auto raw_delta = frame_reader->PullFrame(); + + enc->Encode(raw_key, {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(640, 360).Upd(5).Key().Build()}, + res.CallBack()); + ASSERT_THAT(res.FrameAt(0), NotNull()); + VideoFrame decoded_key = dec.Decode(*res.FrameAt(0)); + EXPECT_THAT(Resolution(decoded_key), ResolutionIs(640, 360)); + EXPECT_THAT(Psnr(raw_key, decoded_key), Gt(40)); + + enc->Encode(raw_delta, {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(640, 360).Ref({0}).Build()}, res.CallBack()); + ASSERT_THAT(res.FrameAt(1), Eq(nullptr)); +} + +TEST(LibaomAv1Encoder, MidTemporalUnitKeyframeResetsBuffers) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + Av1Decoder dec; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Upd(0).Key().Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({0}).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({0}).Build()}, + res.CallBack()); + ASSERT_THAT(res.FrameAt(2), NotNull()); + + enc->Encode(frame_reader->PullFrame(), + 
{.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Upd(0).Ref({0}).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Upd(1).Key().Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({0}).Build()}, + res.CallBack()); + ASSERT_THAT(res.FrameAt(3), Eq(nullptr)); +} + +TEST(LibaomAv1Encoder, ResolutionSwitching) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + rtc::scoped_refptr in0 = frame_reader->PullFrame(); + enc->Encode(in0, {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(320, 180).Upd(0).Key().Build()}, + res.CallBack()); + + rtc::scoped_refptr in1 = frame_reader->PullFrame(); + enc->Encode(in1, {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(640, 360).Ref({0}).Build()}, res.CallBack()); + + rtc::scoped_refptr in2 = frame_reader->PullFrame(); + enc->Encode(in2, {.presentation_timestamp = Timestamp::Millis(200)}, + {Fb().Rate(kCbr).Res(160, 90).Ref({0}).Build()}, res.CallBack()); + + EXPECT_THAT(res.FrameAt(0), Field(&EncodedData::spatial_id, 0)); + EXPECT_THAT(res.FrameAt(1), Field(&EncodedData::spatial_id, 0)); + EXPECT_THAT(res.FrameAt(2), Field(&EncodedData::spatial_id, 0)); + + Av1Decoder dec; + VideoFrame f0 = dec.Decode(*res.FrameAt(0)); + EXPECT_THAT(Resolution(f0), ResolutionIs(320, 180)); + // TD: + // EXPECT_THAT(Psnr(in0, f0), Gt(40)); + + VideoFrame f1 = dec.Decode(*res.FrameAt(1)); + EXPECT_THAT(Resolution(f1), ResolutionIs(640, 360)); + EXPECT_THAT(Psnr(in1, f1), Gt(40)); + + VideoFrame f2 = dec.Decode(*res.FrameAt(2)); + EXPECT_THAT(Resolution(f2), ResolutionIs(160, 90)); + // TD: + // EXPECT_THAT(Psnr(in2, f2), Gt(40)); +} + +TEST(LibaomAv1Encoder, InputResolutionSwitching) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + rtc::scoped_refptr in0 = 
frame_reader->PullFrame(); + enc->Encode(in0, {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(160, 90).Upd(0).Key().Build()}, + res.CallBack()); + + rtc::scoped_refptr in1 = frame_reader->PullFrame( + /*frame_num=*/nullptr, + /*resolution=*/{320, 180}, + /*framerate_scale=*/{1, 1}); + enc->Encode(in1, {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(160, 90).Ref({0}).Build()}, res.CallBack()); + + rtc::scoped_refptr in2 = frame_reader->PullFrame( + /*frame_num=*/nullptr, + /*resolution=*/{160, 90}, + /*framerate_scale=*/{1, 1}); + enc->Encode(in2, {.presentation_timestamp = Timestamp::Millis(200)}, + {Fb().Rate(kCbr).Res(160, 90).Ref({0}).Build()}, res.CallBack()); + + EXPECT_THAT(res.FrameAt(0), Field(&EncodedData::spatial_id, 0)); + EXPECT_THAT(res.FrameAt(1), Field(&EncodedData::spatial_id, 0)); + EXPECT_THAT(res.FrameAt(2), Field(&EncodedData::spatial_id, 0)); + + Av1Decoder dec; + VideoFrame f0 = dec.Decode(*res.FrameAt(0)); + EXPECT_THAT(Resolution(f0), ResolutionIs(160, 90)); + // TD: + // EXPECT_THAT(Psnr(in0, f0), Gt(40)); + + VideoFrame f1 = dec.Decode(*res.FrameAt(1)); + EXPECT_THAT(Resolution(f1), ResolutionIs(160, 90)); + // TD: + // EXPECT_THAT(Psnr(in1, f1), Gt(40)); + + VideoFrame f2 = dec.Decode(*res.FrameAt(2)); + EXPECT_THAT(Resolution(f2), ResolutionIs(160, 90)); + EXPECT_THAT(Psnr(in2, f2), Gt(40)); +} + +TEST(LibaomAv1Encoder, TempoSpatial) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + const Cbr k10Fps{.duration = TimeDelta::Millis(100), + .target_bitrate = DataRate::KilobitsPerSec(500)}; + const Cbr k20Fps{.duration = TimeDelta::Millis(50), + .target_bitrate = DataRate::KilobitsPerSec(500)}; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(k10Fps).Res(160, 90).S(0).Upd(0).Key().Build(), + Fb().Rate(k10Fps).Res(320, 
180).S(1).Ref({0}).Upd(1).Build(), + Fb().Rate(k20Fps).Res(640, 360).S(2).Ref({1}).Upd(2).Build()}, + res.CallBack()); + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(50)}, + {Fb().Rate(k20Fps).Res(640, 360).S(2).Ref({2}).Upd(2).Build()}, + res.CallBack()); + + rtc::scoped_refptr frame = frame_reader->PullFrame(); + enc->Encode(frame, {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(k10Fps).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(k10Fps).Res(320, 180).S(1).Ref({0, 1}).Upd(1).Build(), + Fb().Rate(k20Fps).Res(640, 360).S(2).Ref({1, 2}).Upd(2).Build()}, + res.CallBack()); + + Av1Decoder dec; + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(0))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(1))), ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(2))), ResolutionIs(640, 360)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(3))), ResolutionIs(640, 360)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(4))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(5))), ResolutionIs(320, 180)); + + VideoFrame f = dec.Decode(*res.FrameAt(6)); + EXPECT_THAT(Resolution(f), ResolutionIs(640, 360)); + + EXPECT_THAT(Psnr(frame, f), Gt(40)); +} + +TEST(DISABLED_LibaomAv1Encoder, InvertedTempoSpatial) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(320, 180).S(0).Upd(0).Key().Build(), + Fb().Rate(kCbr).Res(640, 360).S(1).Ref({0}).Upd(1).Build()}, + res.CallBack()); + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(320, 180).S(0).Ref({0}).Upd(0).Build()}, + res.CallBack()); + + rtc::scoped_refptr frame = frame_reader->PullFrame(); + enc->Encode(frame, 
{.presentation_timestamp = Timestamp::Millis(200)}, + {Fb().Rate(kCbr).Res(320, 180).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(640, 360).S(1).Ref({1, 0}).Upd(1).Build()}, + res.CallBack()); + + Av1Decoder dec; + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(0))), ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(1))), ResolutionIs(640, 360)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(2))), ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(3))), ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(4))), ResolutionIs(640, 360)); +} + +TEST(LibaomAv1Encoder, SkipMidLayer) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Upd(0).Key().Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({0}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({1}).Upd(2).Build()}, + res.CallBack()); + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({2}).Upd(2).Build()}, + res.CallBack()); + + rtc::scoped_refptr frame = frame_reader->PullFrame(); + enc->Encode(frame, {.presentation_timestamp = Timestamp::Millis(200)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({0, 1}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({1, 2}).Upd(2).Build()}, + res.CallBack()); + + Av1Decoder dec; + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(0))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(1))), ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(2))), ResolutionIs(640, 360)); + 
EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(3))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(4))), ResolutionIs(640, 360)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(5))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(6))), ResolutionIs(320, 180)); + + VideoFrame f = dec.Decode(*res.FrameAt(7)); + EXPECT_THAT(Resolution(f), ResolutionIs(640, 360)); + EXPECT_THAT(Psnr(frame, f), Gt(40)); +} + +TEST(LibaomAv1Encoder, L3T1) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + Av1Decoder dec; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Upd(0).Key().Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({0}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({1}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(0))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(1))), ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(2))), ResolutionIs(640, 360)); + + auto tu1_frame = frame_reader->PullFrame(); + enc->Encode(tu1_frame, {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({1, 0}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({2, 1}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(3))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(4))), ResolutionIs(320, 180)); + + VideoFrame f_tu1 = dec.Decode(*res.FrameAt(5)); + EXPECT_THAT(Resolution(f_tu1), ResolutionIs(640, 360)); + EXPECT_THAT(Psnr(tu1_frame, f_tu1), Gt(40)); + + auto tu2_frame = frame_reader->PullFrame(); + enc->Encode(tu2_frame, {.presentation_timestamp = Timestamp::Millis(200)}, + 
{Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({1, 0}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({2, 1}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(6))), ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec.Decode(*res.FrameAt(7))), ResolutionIs(320, 180)); + + VideoFrame f_tu2 = dec.Decode(*res.FrameAt(8)); + EXPECT_THAT(Resolution(f_tu2), ResolutionIs(640, 360)); + EXPECT_THAT(Psnr(tu2_frame, f_tu2), Gt(40)); +} + +TEST(LibaomAv1Encoder, L3T1_KEY) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + Av1Decoder dec_s0; + Av1Decoder dec_s1; + Av1Decoder dec_s2; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Upd(0).Key().Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({0}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({1}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec_s0.Decode(*res.FrameAt(0))), + ResolutionIs(160, 90)); + + dec_s1.Decode(*res.FrameAt(0)); + EXPECT_THAT(Resolution(dec_s1.Decode(*res.FrameAt(1))), + ResolutionIs(320, 180)); + + dec_s2.Decode(*res.FrameAt(0)); + dec_s2.Decode(*res.FrameAt(1)); + EXPECT_THAT(Resolution(dec_s2.Decode(*res.FrameAt(2))), + ResolutionIs(640, 360)); + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({1}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({2}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec_s0.Decode(*res.FrameAt(3))), + ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec_s1.Decode(*res.FrameAt(4))), + ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec_s2.Decode(*res.FrameAt(5))), + ResolutionIs(640, 360)); + + 
enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(200)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({1}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({2}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec_s0.Decode(*res.FrameAt(6))), + ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec_s1.Decode(*res.FrameAt(7))), + ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec_s2.Decode(*res.FrameAt(8))), + ResolutionIs(640, 360)); +} + +TEST(LibaomAv1Encoder, S3T1) { + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + + Av1Decoder dec_s0; + Av1Decoder dec_s1; + Av1Decoder dec_s2; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Start().Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Start().Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Start().Upd(2).Build()}, + res.CallBack()); + EXPECT_THAT(Resolution(dec_s0.Decode(*res.FrameAt(0))), + ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec_s1.Decode(*res.FrameAt(1))), + ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec_s2.Decode(*res.FrameAt(2))), + ResolutionIs(640, 360)); + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(100)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({1}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({2}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec_s0.Decode(*res.FrameAt(3))), + ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec_s1.Decode(*res.FrameAt(4))), + ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec_s2.Decode(*res.FrameAt(5))), + ResolutionIs(640, 360)); + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = 
Timestamp::Millis(200)}, + {Fb().Rate(kCbr).Res(160, 90).S(0).Ref({0}).Upd(0).Build(), + Fb().Rate(kCbr).Res(320, 180).S(1).Ref({1}).Upd(1).Build(), + Fb().Rate(kCbr).Res(640, 360).S(2).Ref({2}).Upd(2).Build()}, + res.CallBack()); + + EXPECT_THAT(Resolution(dec_s0.Decode(*res.FrameAt(6))), + ResolutionIs(160, 90)); + EXPECT_THAT(Resolution(dec_s1.Decode(*res.FrameAt(7))), + ResolutionIs(320, 180)); + EXPECT_THAT(Resolution(dec_s2.Decode(*res.FrameAt(8))), + ResolutionIs(640, 360)); +} + +TEST(LibaomAv1Encoder, HigherEffortLevelYieldsHigherQualityFrames) { + auto frame_in = CreateFrameReader()->PullFrame(); + std::pair effort_range = LibaomAv1EncoderFactory() + .GetEncoderCapabilities() + .performance.min_max_effort_level; + // Cbr rc{.duration = TimeDelta::Millis(100), + // .target_bitrate = DataRate::KilobitsPerSec(100)}; + absl::optional psnr_last; + Av1Decoder dec; + + for (int i = effort_range.first; i <= effort_range.second; ++i) { + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + EncodeResults res; + enc->Encode( + frame_in, + {.presentation_timestamp = Timestamp::Millis(0), .effort_level = i}, + {Fb().Rate(kCbr).Res(640, 360).Upd(0).Key().Build()}, res.CallBack()); + double psnr = Psnr(frame_in, dec.Decode(*res.FrameAt(0))); + EXPECT_THAT(psnr, Gt(psnr_last)); + psnr_last = psnr; + } +} + +TEST(LibaomAv1Encoder, KeyframeAndStartrameAreApproximatelyEqual) { + int max_spatial_layers = LibaomAv1EncoderFactory() + .GetEncoderCapabilities() + .prediction_constraints.max_spatial_layers; + const Cbr kRate{.duration = TimeDelta::Millis(100), + .target_bitrate = DataRate::KilobitsPerSec(500)}; + + for (int sid = 0; sid < max_spatial_layers; ++sid) { + std::string key_name = "cbr_key_sl_"; + key_name += std::to_string(sid); + Av1Decoder dec_key(key_name); + + std::string start_name = "cbr_start_sl_"; + start_name += std::to_string(sid); + Av1Decoder dec_start(start_name); + + auto frame_reader = CreateFrameReader(); + auto enc_key = + 
LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + auto enc_start = + LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + DataSize total_size_key = DataSize::Zero(); + DataSize total_size_start = DataSize::Zero(); + TimeDelta total_duration = TimeDelta::Zero(); + EncodeResults res_key; + EncodeResults res_start; + auto frame_in = frame_reader->PullFrame(); + enc_key->Encode( + frame_in, {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kRate).Res(640, 360).S(sid).Upd(0).Key().Build()}, + res_key.CallBack()); + enc_start->Encode( + frame_in, {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kRate).Res(640, 360).S(sid).Start().Upd(0).Build()}, + res_start.CallBack()); + total_size_key += + DataSize::Bytes(res_key.FrameAt(0)->bitstream_data->size()); + total_size_start += + DataSize::Bytes(res_start.FrameAt(0)->bitstream_data->size()); + total_duration += kRate.duration; + dec_key.Decode(*res_key.FrameAt(0)); + dec_start.Decode(*res_start.FrameAt(0)); + + EXPECT_NEAR(total_size_key.bytes(), total_size_start.bytes(), + 0.1 * total_size_key.bytes()); + + for (int f = 1; f < 10; ++f) { + frame_in = frame_reader->PullFrame(); + enc_key->Encode( + frame_in, {.presentation_timestamp = Timestamp::Millis(f * 100)}, + {Fb().Rate(kRate).Res(640, 360).S(sid).Ref({0}).Upd(0).Build()}, + res_key.CallBack()); + enc_start->Encode( + frame_in, {.presentation_timestamp = Timestamp::Millis(f * 100)}, + {Fb().Rate(kRate).Res(640, 360).S(sid).Ref({0}).Upd(0).Build()}, + res_start.CallBack()); + total_size_key += + DataSize::Bytes(res_key.FrameAt(f)->bitstream_data->size()); + total_size_start += + DataSize::Bytes(res_start.FrameAt(f)->bitstream_data->size()); + total_duration += kRate.duration; + dec_key.Decode(*res_key.FrameAt(f)); + dec_start.Decode(*res_start.FrameAt(f)); + } + + double key_encode_kbps = (total_size_key / total_duration).kbps(); + double start_encode_kbps = (total_size_start / total_duration).kbps(); + + 
EXPECT_NEAR(key_encode_kbps, start_encode_kbps, start_encode_kbps * 0.05); + } +} + +TEST(LibaomAv1Encoder, BitrateConsistentAcrossSpatialLayers) { + int max_spatial_layers = LibaomAv1EncoderFactory() + .GetEncoderCapabilities() + .prediction_constraints.max_spatial_layers; + const Cbr kRate{.duration = TimeDelta::Millis(100), + .target_bitrate = DataRate::KilobitsPerSec(500)}; + + for (int sid = 0; sid < max_spatial_layers; ++sid) { + std::string out_name = "cbr_sl_"; + out_name += std::to_string(sid); + Av1Decoder dec(out_name); + + auto frame_reader = CreateFrameReader(); + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCbrEncoderSettings, {}); + DataSize total_size = DataSize::Zero(); + TimeDelta total_duration = TimeDelta::Zero(); + EncodeResults res; + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(kRate).Res(640, 360).S(sid).Upd(0).Key().Build()}, + res.CallBack()); + total_size += DataSize::Bytes(res.FrameAt(0)->bitstream_data->size()); + total_duration += kRate.duration; + dec.Decode(*res.FrameAt(0)); + + for (int f = 1; f < 30; ++f) { + enc->Encode( + frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(f * 100)}, + {Fb().Rate(kRate).Res(640, 360).S(sid).Ref({0}).Upd(0).Build()}, + res.CallBack()); + total_size += DataSize::Bytes(res.FrameAt(f)->bitstream_data->size()); + total_duration += kRate.duration; + dec.Decode(*res.FrameAt(f)); + } + + double encode_kbps = (total_size / total_duration).kbps(); + double target_kbps = kRate.target_bitrate.kbps(); + + EXPECT_NEAR(encode_kbps, target_kbps, target_kbps * 0.1); + } +} + +TEST(LibaomAv1Encoder, ConstantQp) { + int max_spatial_layers = LibaomAv1EncoderFactory() + .GetEncoderCapabilities() + .prediction_constraints.max_spatial_layers; + constexpr int kQp = 30; + for (int sid = 0; sid < max_spatial_layers; ++sid) { + auto enc = LibaomAv1EncoderFactory().CreateEncoder(kCqpEncoderSettings, {}); + std::string out_name = 
"cqp_sl_"; + out_name += std::to_string(sid); + Av1Decoder dec(out_name); + DataSize total_size = DataSize::Zero(); + auto frame_reader = CreateFrameReader(); + EncodeResults res; + + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(0)}, + {Fb().Rate(Cqp{.target_qp = kQp}) + .Res(640, 360) + .S(sid) + .Upd(0) + .Key() + .Build()}, + res.CallBack()); + EXPECT_THAT(res.FrameAt(0)->encoded_qp, Eq(kQp)); + total_size += DataSize::Bytes(res.FrameAt(0)->bitstream_data->size()); + dec.Decode(*res.FrameAt(0)); + + for (int f = 1; f < 10; ++f) { + enc->Encode(frame_reader->PullFrame(), + {.presentation_timestamp = Timestamp::Millis(f * 100)}, + {Fb().Rate(Cqp{.target_qp = kQp - f}) + .Res(640, 360) + .S(sid) + .Ref({0}) + .Upd(0) + .Build()}, + res.CallBack()); + EXPECT_THAT(res.FrameAt(f)->encoded_qp, Eq(kQp - f)); + dec.Decode(*res.FrameAt(f)); + } + } +} + +} // namespace +} // namespace webrtc diff --git a/api/video_codecs/simple_encoder_wrapper.cc b/api/video_codecs/simple_encoder_wrapper.cc new file mode 100644 index 0000000000..bb2eda6afb --- /dev/null +++ b/api/video_codecs/simple_encoder_wrapper.cc @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2024 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "api/video_codecs/simple_encoder_wrapper.h" + +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "api/video_codecs/scalability_mode.h" +#include "api/video_codecs/scalability_mode_helper.h" +#include "api/video_codecs/video_encoder_factory_interface.h" +#include "api/video_codecs/video_encoder_interface.h" +#include "modules/video_coding/svc/create_scalability_structure.h" + +namespace webrtc { +using PredictionConstraints = + VideoEncoderFactoryInterface::Capabilities::PredictionConstraints; +using FrameEncodeSettings = VideoEncoderInterface::FrameEncodeSettings; + +namespace { +enum class Inter { kS, kL, kKey }; +enum class Scaling { k1_2, k2_3 }; +std::string SvcToString(int spatial_layers, + int temporal_layers, + Inter inter, + Scaling scaling) { + RTC_CHECK(spatial_layers > 1 || inter == Inter::kL); + std::string res; + res += inter == Inter::kS ? "S" : "L"; + res += std::to_string(spatial_layers); + res += "T"; + res += std::to_string(temporal_layers); + if (scaling == Scaling::k2_3) { + res += "h"; + } + if (inter == Inter::kKey) { + res += "_KEY"; + } + + return res; +} +} // namespace + +// static +std::vector SimpleEncoderWrapper::SupportedWebrtcSvcModes( + const PredictionConstraints& prediction_constraints) { + std::vector res; + + const int max_spatial_layers = + std::min(3, prediction_constraints.max_spatial_layers); + const int max_temporal_layers = + std::min(3, prediction_constraints.max_temporal_layers); + const bool scale_by_half = absl::c_linear_search( + prediction_constraints.scaling_factors, Rational{1, 2}); + const bool scale_by_two_thirds = absl::c_linear_search( + prediction_constraints.scaling_factors, Rational{2, 3}); + const bool inter_layer = + prediction_constraints.max_references > 1 && + prediction_constraints.buffer_space_type != + PredictionConstraints::BufferSpaceType::kMultiInstance; + + for (int s = 1; s <= max_spatial_layers; ++s) { + for (int t = 1; t <= 
max_temporal_layers; ++t) {
+      if (prediction_constraints.num_buffers > ((std::max(1, t - 1) * s) - 1)) {
+        // Multi-layer "L" modes without the "h" suffix use 1:2 scaling, so,
+        // like the S and _KEY variants below, they are only reported when the
+        // encoder supports the 1/2 scaling factor.
+        if (s == 1 || (inter_layer && scale_by_half)) {
+          res.push_back(SvcToString(s, t, Inter::kL, Scaling::k1_2));
+          if (s == 1) {
+            continue;
+          }
+        }
+        if (scale_by_half) {
+          res.push_back(SvcToString(s, t, Inter::kS, Scaling::k1_2));
+          if (inter_layer) {
+            res.push_back(SvcToString(s, t, Inter::kKey, Scaling::k1_2));
+          }
+        }
+        if (scale_by_two_thirds) {
+          res.push_back(SvcToString(s, t, Inter::kS, Scaling::k2_3));
+          if (inter_layer) {
+            res.push_back(SvcToString(s, t, Inter::kKey, Scaling::k2_3));
+            res.push_back(SvcToString(s, t, Inter::kL, Scaling::k2_3));
+          }
+        }
+      }
+    }
+  }
+
+  return res;
+}
+
+// static
+// Wraps `encoder` together with the scalability structure named by
+// `scalability_mode`. Returns nullptr if `encoder` is null, the mode string
+// cannot be parsed, or no scalability structure can be created for the mode.
+std::unique_ptr<SimpleEncoderWrapper> SimpleEncoderWrapper::Create(
+    std::unique_ptr<VideoEncoderInterface> encoder,
+    absl::string_view scalability_mode) {
+  if (!encoder) {
+    return nullptr;
+  }
+
+  absl::optional<ScalabilityMode> sm =
+      ScalabilityModeStringToEnum(scalability_mode);
+  if (!sm) {
+    return nullptr;
+  }
+
+  std::unique_ptr<ScalableVideoController> svc_controller =
+      CreateScalabilityStructure(*sm);
+  if (!svc_controller) {
+    return nullptr;
+  }
+
+  return std::make_unique<SimpleEncoderWrapper>(std::move(encoder),
+                                                std::move(svc_controller));
+}
+
+// Takes ownership of both arguments and caches the controller's stream
+// configuration, which Encode() reads for the per-layer scaling factors.
+SimpleEncoderWrapper::SimpleEncoderWrapper(
+    std::unique_ptr<VideoEncoderInterface> encoder,
+    std::unique_ptr<ScalableVideoController> svc_controller)
+    : encoder_(std::move(encoder)),
+      svc_controller_(std::move(svc_controller)),
+      layer_configs_(svc_controller_->StreamConfig()) {}
+
+// Sets the QP requested for every subsequently encoded frame.
+void SimpleEncoderWrapper::SetEncodeQp(int qp) {
+  target_qp_ = qp;
+}
+
+// Sets the frame rate used to advance the presentation timestamp in Encode().
+void SimpleEncoderWrapper::SetEncodeFps(int fps) {
+  fps_ = fps;
+}
+
+// Encodes `frame_buffer` as one temporal unit. The scalability controller
+// supplies one layer frame config per frame to encode (a keyframe is requested
+// via `force_keyframe`); each config is translated into encoder settings and
+// `callback` is invoked, through an adapter, for every result the underlying
+// encoder reports. SetEncodeFps() must have been called with a positive value.
+void SimpleEncoderWrapper::Encode(
+    rtc::scoped_refptr<webrtc::VideoFrameBuffer> frame_buffer,
+    bool force_keyframe,
+    EncodeResultCallback callback) {
+  // The frame rate is the divisor of the timestamp step computed at the end
+  // of this function; fail loudly here rather than divide by a zero rate.
+  RTC_CHECK(fps_ > 0);
+
+  std::vector<ScalableVideoController::LayerFrameConfig> configs =
+      svc_controller_->NextFrameConfig(force_keyframe);
+  std::vector<FrameEncodeSettings> encode_settings;
+  std::vector<GenericFrameInfo> frame_infos;
+
+  bool include_dependency_structure = false;
+
+  for (size_t s = 0; s < configs.size(); ++s) {
+    const ScalableVideoController::LayerFrameConfig& config = configs[s];
+    frame_infos.push_back(svc_controller_->OnEncodeDone(config));
+    FrameEncodeSettings& settings = encode_settings.emplace_back();
+    settings.rate_options = VideoEncoderInterface::FrameEncodeSettings::Cqp{
+        .target_qp = target_qp_};
+    settings.spatial_id = config.SpatialId();
+    settings.temporal_id = config.TemporalId();
+    // NOTE(review): the scaling factors are looked up by position in
+    // `configs`, which assumes config i describes spatial layer i - confirm
+    // for structures that can skip layers.
+    const int num = layer_configs_.scaling_factor_num[s];
+    const int den = layer_configs_.scaling_factor_den[s];
+    settings.resolution = {(frame_buffer->width() * num / den),
+                           (frame_buffer->height() * num / den)};
+
+    bool buffer_updated = false;
+    for (const CodecBufferUsage& buffer : config.Buffers()) {
+      if (buffer.referenced) {
+        settings.reference_buffers.push_back(buffer.id);
+      }
+      if (buffer.updated) {
+        // FrameEncodeSettings has a single update_buffer slot.
+        RTC_CHECK(!buffer_updated);
+        settings.update_buffer = buffer.id;
+        buffer_updated = true;
+      }
+    }
+
+    // A frame without references is encoded as a keyframe.
+    if (settings.reference_buffers.empty()) {
+      settings.frame_type = FrameType::kKeyframe;
+      include_dependency_structure = true;
+    }
+  }
+
+  absl::optional<FrameDependencyStructure> dependency_structure;
+  if (include_dependency_structure) {
+    dependency_structure = svc_controller_->DependencyStructure();
+  }
+
+  VideoEncoderInterface::EncodeResultCallback callback_internal =
+      [cb = std::move(callback), ds = std::move(dependency_structure),
+       infos = std::move(frame_infos)](
+          const VideoEncoderInterface::EncodeResult& result) mutable {
+        const auto* data =
+            std::get_if<VideoEncoderInterface::EncodedData>(&result);
+        EncodeResult res;
+        // Report an error for encoder failures and for spatial ids that do
+        // not map to a frame info collected above (including negative ids).
+        if (!data || data->spatial_id < 0 ||
+            data->spatial_id >= static_cast<int>(infos.size())) {
+          res.oh_no = true;
+          cb(res);
+          return;
+        }
+
+        res.frame_type = data->frame_type;
+        // `data` points to const, so this shares the buffer (ref-count copy).
+        res.bitstream_data = data->bitstream_data;
+        res.generic_frame_info = infos[data->spatial_id];
+        if (data->referenced_buffers.empty()) {
+          // Keyframe
+          res.dependency_structure = ds;
+        }
+        cb(res);
+      };
+
+  encoder_->Encode(std::move(frame_buffer),
+                   {.presentation_timestamp = presentation_timestamp_},
+                   encode_settings, std::move(callback_internal));
+  presentation_timestamp_ += 1 / Frequency::Hertz(fps_);
+}
+
+} //
namespace webrtc diff --git a/api/video_codecs/simple_encoder_wrapper.h b/api/video_codecs/simple_encoder_wrapper.h new file mode 100644 index 0000000000..4d11020924 --- /dev/null +++ b/api/video_codecs/simple_encoder_wrapper.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef API_VIDEO_CODECS_SIMPLE_ENCODER_WRAPPER_H_ +#define API_VIDEO_CODECS_SIMPLE_ENCODER_WRAPPER_H_ + +#include +#include +#include + +#include "absl/functional/any_invocable.h" +#include "api/units/data_rate.h" +#include "api/video_codecs/video_encoder_factory_interface.h" +#include "api/video_codecs/video_encoder_interface.h" +#include "common_video/generic_frame_descriptor/generic_frame_info.h" +#include "modules/video_coding/svc/create_scalability_structure.h" + +namespace webrtc { +class SimpleEncoderWrapper { + public: + struct EncodeResult { + bool oh_no = false; + rtc::scoped_refptr bitstream_data; + FrameType frame_type; + GenericFrameInfo generic_frame_info; + absl::optional dependency_structure; + }; + + using EncodeResultCallback = + absl::AnyInvocable; + + static std::vector SupportedWebrtcSvcModes( + const VideoEncoderFactoryInterface::Capabilities::PredictionConstraints& + prediction_constraints); + + static std::unique_ptr Create( + std::unique_ptr encoder, + absl::string_view scalability_mode); + + // Should be private, use the Create function instead. + SimpleEncoderWrapper(std::unique_ptr encoder, + std::unique_ptr svc_controller); + + // We should really only support CBR, but then we have to think about layer + // allocations... eh... For this PoC just use CQP. 
+ void SetEncodeQp(int qp); + + void SetEncodeFps(int fps); + + void Encode(rtc::scoped_refptr frame_buffer, + bool force_keyframe, + EncodeResultCallback callback); + + private: + std::unique_ptr encoder_; + std::unique_ptr svc_controller_; + ScalableVideoController::StreamLayersConfig layer_configs_; + int target_qp_ = 0; + int fps_ = 0; + Timestamp presentation_timestamp_ = Timestamp::Zero(); +}; + +} // namespace webrtc +#endif // API_VIDEO_CODECS_SIMPLE_ENCODER_WRAPPER_H_ diff --git a/api/video_codecs/simple_encoder_wrapper_unittests.cc b/api/video_codecs/simple_encoder_wrapper_unittests.cc new file mode 100644 index 0000000000..83518b6be5 --- /dev/null +++ b/api/video_codecs/simple_encoder_wrapper_unittests.cc @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2024 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "api/video_codecs/simple_encoder_wrapper.h" + +#include "api/video/i420_buffer.h" +#include "api/video_codecs/libaom_av1_encoder_factory.h" +#include "api/video_codecs/video_encoder_interface.h" +#include "test/gmock.h" +#include "test/gtest.h" +#include "test/testsupport/file_utils.h" +#include "test/testsupport/frame_reader.h" + +namespace webrtc { + +using ::testing::Eq; +using ::testing::Gt; +using ::testing::IsEmpty; +using ::testing::Ne; +using ::testing::Not; +using ::testing::NotNull; +using ::testing::UnorderedElementsAre; +using PredictionConstraints = + VideoEncoderFactoryInterface::Capabilities::PredictionConstraints; + +namespace { + +std::unique_ptr CreateFrameReader() { + return CreateY4mFrameReader( + test::ResourcePath("reference_video_640x360_30fps", "y4m"), + test::YuvFrameReaderImpl::RepeatMode::kPingPong); +} + +TEST(SimpleEncoderWrapper, SupportedSvcModesOnlyL1T1) { + PredictionConstraints constraints = { + .num_buffers = 2, + .max_references = 2, + .max_temporal_layers = 1, + .buffer_space_type = + PredictionConstraints::BufferSpaceType::kSingleKeyframe, + .max_spatial_layers = 1, + .scaling_factors = {{1, 1}}, + }; + + EXPECT_THAT(SimpleEncoderWrapper::SupportedWebrtcSvcModes(constraints), + UnorderedElementsAre("L1T1")); +} + +TEST(SimpleEncoderWrapper, SupportedSvcModesUpToL1T3) { + PredictionConstraints constraints = { + .num_buffers = 8, + .max_references = 1, + .max_temporal_layers = 3, + .buffer_space_type = + PredictionConstraints::BufferSpaceType::kSingleKeyframe, + .max_spatial_layers = 1, + .scaling_factors = {{1, 1}, {1, 2}}, + }; + + EXPECT_THAT(SimpleEncoderWrapper::SupportedWebrtcSvcModes(constraints), + UnorderedElementsAre("L1T1", "L1T2", "L1T3")); +} + +TEST(SimpleEncoderWrapper, SupportedSvcModesUpToL3T3Key) { + PredictionConstraints constraints = { + .num_buffers = 8, + .max_references = 2, + .max_temporal_layers = 3, + .buffer_space_type = + PredictionConstraints::BufferSpaceType::kSingleKeyframe, + 
.max_spatial_layers = 3, + .scaling_factors = {{1, 1}, {1, 2}}, + }; + + EXPECT_THAT( + SimpleEncoderWrapper::SupportedWebrtcSvcModes(constraints), + UnorderedElementsAre("L1T1", "L1T2", "L1T3", "L2T1", "L2T1_KEY", "L2T2", + "L2T2_KEY", "L2T3", "L2T3_KEY", "L3T1", "L3T1_KEY", + "L3T2", "L3T2_KEY", "L3T3", "L3T3_KEY", "S2T1", + "S2T2", "S2T3", "S3T1", "S3T2", "S3T3")); +} + +TEST(SimpleEncoderWrapper, SupportedSvcModesUpToS3T3) { + PredictionConstraints constraints = { + .num_buffers = 8, + .max_references = 2, + .max_temporal_layers = 3, + .buffer_space_type = + PredictionConstraints::BufferSpaceType::kMultiInstance, + .max_spatial_layers = 3, + .scaling_factors = {{1, 1}, {1, 2}}, + }; + + EXPECT_THAT(SimpleEncoderWrapper::SupportedWebrtcSvcModes(constraints), + UnorderedElementsAre("L1T1", "L1T2", "L1T3", "S2T1", "S2T2", + "S2T3", "S3T1", "S3T2", "S3T3")); +} + +TEST(SimpleEncoderWrapper, SupportedSvcModesUpToL3T3KeyWithHScaling) { + PredictionConstraints constraints = { + .num_buffers = 8, + .max_references = 2, + .max_temporal_layers = 3, + .buffer_space_type = + PredictionConstraints::BufferSpaceType::kSingleKeyframe, + .max_spatial_layers = 3, + .scaling_factors = {{1, 1}, {1, 2}, {2, 3}}, + }; + + EXPECT_THAT( + SimpleEncoderWrapper::SupportedWebrtcSvcModes(constraints), + UnorderedElementsAre( + "L1T1", "L1T2", "L1T3", "L2T1", "L2T1h", "L2T1_KEY", "L2T1h_KEY", + "L2T2", "L2T2h", "L2T2_KEY", "L2T2h_KEY", "L2T3", "L2T3h", "L2T3_KEY", + "L2T3h_KEY", "L3T1", "L3T1h", "L3T1_KEY", "L3T1h_KEY", "L3T2", + "L3T2h", "L3T2_KEY", "L3T2h_KEY", "L3T3", "L3T3h", "L3T3_KEY", + "L3T3h_KEY", "S2T1", "S2T1h", "S2T2", "S2T2h", "S2T3", "S2T3h", + "S3T1", "S3T1h", "S3T2", "S3T2h", "S3T3", "S3T3h")); +} + +// TD: The encoder wrapper shouldn't really use an actual encoder implementation +// for testing, but hey, this is just a PoC. 
+TEST(SimpleEncoderWrapper, EncodeL1T1) {
+  // Encodes a forced keyframe followed by a delta frame in L1T1 mode and
+  // checks the result delivered to each callback.
+  auto encoder = LibaomAv1EncoderFactory().CreateEncoder(
+      {.max_encode_dimensions = {1080, 720},
+       .encoding_format = {.sub_sampling = EncodingFormat::k420,
+                           .bit_depth = 8},
+       .rc_mode = VideoEncoderFactoryInterface::StaticEncoderSettings::Cqp(),
+       .max_number_of_threads = 1},
+      {});
+
+  std::unique_ptr<SimpleEncoderWrapper> simple_encoder =
+      SimpleEncoderWrapper::Create(std::move(encoder), "L1T1");
+
+  ASSERT_THAT(simple_encoder, NotNull());
+
+  simple_encoder->SetEncodeQp(30);
+  simple_encoder->SetEncodeFps(15);
+  auto frame_reader = CreateFrameReader();
+
+  int num_callbacks = 0;
+  // Forced keyframe: the result carries the dependency structure.
+  simple_encoder->Encode(
+      frame_reader->PullFrame(), /*force_keyframe=*/true,
+      [&](const SimpleEncoderWrapper::EncodeResult& result) {
+        ++num_callbacks;
+        ASSERT_THAT(result.oh_no, Eq(false));
+        EXPECT_THAT(result.dependency_structure, Ne(absl::nullopt));
+        EXPECT_THAT(result.bitstream_data, NotNull());
+        EXPECT_THAT(result.frame_type, Eq(FrameType::kKeyframe));
+        EXPECT_THAT(result.generic_frame_info.spatial_id, Eq(0));
+        EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(0));
+      });
+
+  // Delta frame: no dependency structure is attached.
+  simple_encoder->Encode(
+      frame_reader->PullFrame(), /*force_keyframe=*/false,
+      [&](const SimpleEncoderWrapper::EncodeResult& result) {
+        ++num_callbacks;
+        ASSERT_THAT(result.oh_no, Eq(false));
+        EXPECT_THAT(result.dependency_structure, Eq(absl::nullopt));
+        EXPECT_THAT(result.bitstream_data, NotNull());
+        EXPECT_THAT(result.frame_type, Eq(FrameType::kDeltaFrame));
+        EXPECT_THAT(result.generic_frame_info.spatial_id, Eq(0));
+        EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(0));
+      });
+
+  // One layer frame per Encode() call. Without this check the test would
+  // pass even if neither callback ever ran. Like the other tests in this
+  // file, it assumes callbacks are delivered before Encode() returns.
+  EXPECT_THAT(num_callbacks, Eq(2));
+}
+
+TEST(SimpleEncoderWrapper, DISABLED_EncodeL2T2_KEY) {
+  auto encoder = LibaomAv1EncoderFactory().CreateEncoder(
+      {.max_encode_dimensions = {1080, 720},
+       .encoding_format = {.sub_sampling = EncodingFormat::k420,
+                           .bit_depth = 8},
+       .rc_mode = VideoEncoderFactoryInterface::StaticEncoderSettings::Cqp(),
+       .max_number_of_threads = 1},
+      {});
+
+  std::unique_ptr<SimpleEncoderWrapper> simple_encoder = +
SimpleEncoderWrapper::Create(std::move(encoder), "L2T2_KEY"); + + ASSERT_THAT(simple_encoder, NotNull()); + + simple_encoder->SetEncodeQp(30); + simple_encoder->SetEncodeFps(15); + auto frame_reader = CreateFrameReader(); + + int num_callbacks = 0; + simple_encoder->Encode( + frame_reader->PullFrame(), /*force_keyframe=*/true, + [&](const SimpleEncoderWrapper::EncodeResult& result) { + ASSERT_THAT(result.oh_no, Eq(false)); + if (result.generic_frame_info.spatial_id == 0) { + ++num_callbacks; + EXPECT_THAT(result.dependency_structure, Ne(absl::nullopt)); + EXPECT_THAT(result.bitstream_data, NotNull()); + EXPECT_THAT(result.frame_type, Eq(FrameType::kKeyframe)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(0)); + } else if (result.generic_frame_info.spatial_id == 1) { + ++num_callbacks; + EXPECT_THAT(result.dependency_structure, Eq(absl::nullopt)); + EXPECT_THAT(result.bitstream_data, NotNull()); + EXPECT_THAT(result.frame_type, Eq(FrameType::kDeltaFrame)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(0)); + } + }); + + simple_encoder->Encode( + frame_reader->PullFrame(), /*force_keyframe=*/false, + [&](const SimpleEncoderWrapper::EncodeResult& result) { + ASSERT_THAT(result.oh_no, Eq(false)); + if (result.generic_frame_info.spatial_id == 0) { + ++num_callbacks; + EXPECT_THAT(result.dependency_structure, Eq(absl::nullopt)); + EXPECT_THAT(result.bitstream_data, NotNull()); + EXPECT_THAT(result.frame_type, Eq(FrameType::kDeltaFrame)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(1)); + } else if (result.generic_frame_info.spatial_id == 1) { + ++num_callbacks; + EXPECT_THAT(result.dependency_structure, Eq(absl::nullopt)); + EXPECT_THAT(result.bitstream_data, NotNull()); + EXPECT_THAT(result.frame_type, Eq(FrameType::kDeltaFrame)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(1)); + } + }); + + EXPECT_THAT(num_callbacks, Eq(4)); +} + +TEST(SimpleEncoderWrapper, DISABLED_EncodeL1T3ForceKeyframe) { + auto encoder = 
LibaomAv1EncoderFactory().CreateEncoder( + {.max_encode_dimensions = {1080, 720}, + .encoding_format = {.sub_sampling = EncodingFormat::k420, + .bit_depth = 8}, + .rc_mode = VideoEncoderFactoryInterface::StaticEncoderSettings::Cqp(), + .max_number_of_threads = 1}, + {}); + + std::unique_ptr simple_encoder = + SimpleEncoderWrapper::Create(std::move(encoder), "L1T3"); + + ASSERT_THAT(simple_encoder, NotNull()); + + simple_encoder->SetEncodeQp(30); + simple_encoder->SetEncodeFps(15); + auto frame_reader = CreateFrameReader(); + + int num_callbacks = 0; + simple_encoder->Encode( + frame_reader->PullFrame(), /*force_keyframe=*/true, + [&](const SimpleEncoderWrapper::EncodeResult& result) { + ++num_callbacks; + ASSERT_THAT(result.oh_no, Eq(false)); + EXPECT_THAT(result.frame_type, Eq(FrameType::kKeyframe)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(0)); + }); + + simple_encoder->Encode( + frame_reader->PullFrame(), /*force_keyframe=*/false, + [&](const SimpleEncoderWrapper::EncodeResult& result) { + ++num_callbacks; + ASSERT_THAT(result.oh_no, Eq(false)); + EXPECT_THAT(result.frame_type, Eq(FrameType::kDeltaFrame)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(2)); + }); + + simple_encoder->Encode( + frame_reader->PullFrame(), /*force_keyframe=*/false, + [&](const SimpleEncoderWrapper::EncodeResult& result) { + ++num_callbacks; + ASSERT_THAT(result.oh_no, Eq(false)); + EXPECT_THAT(result.frame_type, Eq(FrameType::kDeltaFrame)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(1)); + }); + + simple_encoder->Encode( + frame_reader->PullFrame(), /*force_keyframe=*/true, + [&](const SimpleEncoderWrapper::EncodeResult& result) { + ++num_callbacks; + ASSERT_THAT(result.oh_no, Eq(false)); + EXPECT_THAT(result.frame_type, Eq(FrameType::kKeyframe)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(0)); + }); + + simple_encoder->Encode( + frame_reader->PullFrame(), /*force_keyframe=*/false, + [&](const SimpleEncoderWrapper::EncodeResult& 
result) { + ++num_callbacks; + ASSERT_THAT(result.oh_no, Eq(false)); + EXPECT_THAT(result.frame_type, Eq(FrameType::kDeltaFrame)); + EXPECT_THAT(result.generic_frame_info.temporal_id, Eq(2)); + }); + + EXPECT_THAT(num_callbacks, Eq(5)); +} + +} // namespace +} // namespace webrtc diff --git a/api/video_codecs/video_encoder_factory_interface.h b/api/video_codecs/video_encoder_factory_interface.h new file mode 100644 index 0000000000..63b21d4ce0 --- /dev/null +++ b/api/video_codecs/video_encoder_factory_interface.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef API_VIDEO_CODECS_VIDEO_ENCODER_FACTORY_INTERFACE_H_ +#define API_VIDEO_CODECS_VIDEO_ENCODER_FACTORY_INTERFACE_H_ + +#include +#include +#include +#include +#include +#include + +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#include "api/units/time_delta.h" +#include "api/video/resolution.h" +#include "api/video_codecs/video_encoder_interface.h" +#include "api/video_codecs/video_encoding_general.h" +#include "rtc_base/numerics/rational.h" + +namespace webrtc { +using FrameType = VideoEncoderInterface::FrameType; + +// NOTE: This class is still under development and may change without notice. +class VideoEncoderFactoryInterface { + public: + enum class RateControlMode { kCqp, kCbr }; + + struct Capabilities { + struct PredictionConstraints { + enum class BufferSpaceType { + kMultiInstance, // multiple independent sets of buffers + kMultiKeyframe, // single set of buffers, but can store multiple + // keyframes simultaneously. 
+ kSingleKeyframe // single set of buffers, can only store one keyframe + // at a time. + }; + + int num_buffers; + int max_references; + int max_temporal_layers; + + BufferSpaceType buffer_space_type; + int max_spatial_layers; + std::vector scaling_factors; + + std::vector supported_frame_types; + } prediction_constraints; + + struct InputConstraints { + Resolution min; + Resolution max; + int pixel_alignment; + std::vector input_formats; + } input_constraints; + + std::vector encoding_formats; + + struct BitrateControl { + std::pair qp_range; + std::vector rc_modes; + } rate_control; + + struct Performance { + std::pair min_max_effort_level; + } performance; + }; + + // Settings supplied to CreateEncoder(). + struct StaticEncoderSettings { + struct Cqp {}; + struct Cbr { + // TD: Should there be an initial buffer size? + TimeDelta max_buffer_size; + TimeDelta target_buffer_size; + }; + + Resolution max_encode_dimensions; + EncodingFormat encoding_format; + absl::variant rc_mode; + int max_number_of_threads; + }; + + virtual ~VideoEncoderFactoryInterface() = default; + + virtual std::string CodecName() const = 0; + virtual std::string ImplementationName() const = 0; + virtual std::map CodecSpecifics() const = 0; + + virtual Capabilities GetEncoderCapabilities() const = 0; + virtual std::unique_ptr CreateEncoder( + const StaticEncoderSettings& settings, + const std::map& encoder_specific_settings) = 0; +}; + +} // namespace webrtc +#endif // API_VIDEO_CODECS_VIDEO_ENCODER_FACTORY_INTERFACE_H_ diff --git a/api/video_codecs/video_encoder_interface.h b/api/video_codecs/video_encoder_interface.h new file mode 100644 index 0000000000..9d0ce970ef --- /dev/null +++ b/api/video_codecs/video_encoder_interface.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree.
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef API_VIDEO_CODECS_VIDEO_ENCODER_INTERFACE_H_ +#define API_VIDEO_CODECS_VIDEO_ENCODER_INTERFACE_H_ + +#include +#include +// #include +#include +#include +#include + +#include "absl/functional/any_invocable.h" +#include "absl/types/optional.h" +#include "absl/types/variant.h" +#include "api/units/data_rate.h" +#include "api/units/time_delta.h" +#include "api/units/timestamp.h" +#include "api/video/encoded_image.h" +#include "api/video/resolution.h" +#include "api/video/video_frame.h" +#include "api/video_codecs/video_codec.h" +#include "api/video_codecs/video_encoding_general.h" +#include "rtc_base/numerics/rational.h" + +namespace webrtc { +// NOTE: This class is still under development and may change without notice. +class VideoEncoderInterface { + public: + virtual ~VideoEncoderInterface() = default; + enum class FrameType { kKeyframe, kStartFrame, kDeltaFrame }; + + struct TemporalUnitSettings { + VideoCodecMode content_hint = VideoCodecMode::kRealtimeVideo; + Timestamp presentation_timestamp; + int effort_level = 0; + }; + + struct FrameEncodeSettings { + struct Cbr { + TimeDelta duration; + DataRate target_bitrate; + }; + + struct Cqp { + int target_qp; + }; + + absl::variant rate_options; + + FrameType frame_type = FrameType::kDeltaFrame; + int temporal_id = 0; + int spatial_id = 0; + Resolution resolution; + std::vector reference_buffers; + absl::optional update_buffer; + }; + + // Results from calling Encode. Called once for each configured frame. 
+ struct EncodingError {}; + + struct EncodedData { + rtc::scoped_refptr bitstream_data; + FrameType frame_type; + int spatial_id; + int encoded_qp; + std::vector referenced_buffers; + }; + + using EncodeResult = std::variant; + using EncodeResultCallback = + absl::AnyInvocable; + + virtual void Encode(rtc::scoped_refptr frame_buffer, + const TemporalUnitSettings& settings, + const std::vector& frame_settings, + EncodeResultCallback encode_result_callback) = 0; +}; + +} // namespace webrtc +#endif // API_VIDEO_CODECS_VIDEO_ENCODER_INTERFACE_H_ diff --git a/api/video_codecs/video_encoding_general.h b/api/video_codecs/video_encoding_general.h new file mode 100644 index 0000000000..171e211dba --- /dev/null +++ b/api/video_codecs/video_encoding_general.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2023 The WebRTC project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef API_VIDEO_CODECS_VIDEO_ENCODING_GENERAL_H_ +#define API_VIDEO_CODECS_VIDEO_ENCODING_GENERAL_H_ + +namespace webrtc { + +// Pairs a sub-sampling scheme with a bit depth. +struct EncodingFormat { + enum SubSampling { k420, k422, k444 }; + SubSampling sub_sampling; + int bit_depth; +}; + +} // namespace webrtc +#endif // API_VIDEO_CODECS_VIDEO_ENCODING_GENERAL_H_ diff --git a/rtc_base/BUILD.gn b/rtc_base/BUILD.gn index e5ead5b0f5..a7a2adadfc 100644 --- a/rtc_base/BUILD.gn +++ b/rtc_base/BUILD.gn @@ -815,6 +815,7 @@ rtc_library("rtc_numerics") { "numerics/moving_average.h", "numerics/moving_percentile_filter.h", "numerics/percentile_filter.h", + "numerics/rational.h", "numerics/running_statistics.h", "numerics/sequence_number_unwrapper.h", "numerics/sequence_number_util.h", diff --git a/rtc_base/numerics/rational.h b/rtc_base/numerics/rational.h new file mode 100644 index 0000000000..32f0cb1597 --- /dev/null +++ b/rtc_base/numerics/rational.h @@ -0,0 +1,28 @@ +/* + * Copyright 2024 The WebRTC Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef RTC_BASE_NUMERICS_RATIONAL_H_ +#define RTC_BASE_NUMERICS_RATIONAL_H_ + +namespace webrtc { + +// This is the worst implementation of a rational... +// Plain numerator/denominator pair; values are neither reduced nor validated. +struct Rational { + int numerator; + int denominator; + + // Field-wise comparison: fractions are not reduced first, so {1, 2} and + // {2, 4} compare unequal. + bool operator==(const Rational& other) const { + return numerator == other.numerator && denominator == other.denominator; + } +}; + +} // namespace webrtc + +#endif // RTC_BASE_NUMERICS_RATIONAL_H_