Reland "Mark frames with inter_layer_predicted=true as delta frames"
This is a reland of commit 7ae48c452a with an updated RtpVp9RefFinder.
RtpVp9RefFinder relied on the fact that frames with (inter_pic_predicted=false && inter_layer_predicted=true) were marked as keyframes. Since this is no longer the case, the related code paths in RtpVp9RefFinder have been deleted.
Calculation of the gof_info_[] index for non-keyframes has been updated to account for the fact that it is now possible to receive multiple T0 frames belonging to the same temporal unit (in which case we don't need to do "unwrapped_tl0 - 1").
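For reference, here is a minimal sketch of the updated lookup described above. It mirrors the use_prev_gof logic visible in the RtpVp9RefFinder hunk further down; the helper name and signature are illustrative, not part of WebRTC.

  // Hypothetical helper (not actual WebRTC code): selects the gof_info_ key
  // for a non-keyframe. TL0PICIDX is only incremented on temporal_idx=0 frames
  // of the lowest spatial layer, so an upper spatial layer T0 frame that
  // inter-layer predicts from its base layer belongs to the current GOF entry,
  // while an independent T0 frame still refers to the previous one.
  int64_t GofInfoKey(int64_t unwrapped_tl0,
                     int temporal_idx,
                     bool inter_layer_predicted) {
    const bool use_prev_gof = temporal_idx == 0 && !inter_layer_predicted;
    return use_prev_gof ? unwrapped_tl0 - 1 : unwrapped_tl0;
  }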
Original change's description:
> Mark frames with inter_layer_predicted=true as delta frames
>
> As currently implemented, the VP9 depacketizer decides a packet's frame type based on p_bit ("Inter-picture predicted layer frame"). p_bit is set to 0 for upper spatial layer frames of a keyframe since they do not have temporal refs. As a result, packets of upper spatial layer frames of SVC keyframes, and eventually these frames themselves, are marked as "keyframe" while they are in fact delta frames.
>
> Normally, spatial layer frames are merged into a superframe and the superframe is passed to the decoder. But passing individual layers to a single decoder instance is a valid scenario too and is used in downstream projects. In that case, an upper layer frame marked as a keyframe may cause a decoder reset [2] and break decoding.
>
> This CL changes the frame type decision logic in the VP9 depacketizer such that only packets with both the P and D (inter-layer predicted) bits unset are considered keyframe packets.
>
> When spatial layer frames are merged into a superframe in CombineAndDeleteFrames [1], the frame type of the superframe is inferred from the lowest spatial layer frame.
>
> [1] https://source.chromium.org/chromium/chromium/src/+/main:third_party/webrtc/modules/video_coding/frame_helpers.cc;l=53
>
> [2] https://source.corp.google.com/piper///depot/google3/third_party/webrtc/files/stable/webrtc/modules/video_coding/codecs/vp9/libvpx_vp9_decoder.cc;l=209
>
> Bug: webrtc:15827
> Change-Id: Idc3445636f0eae0192dac998876fedec48628560
> Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/343342
> Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
> Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
> Cr-Commit-Position: refs/heads/main@{#41939}
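To make the quoted description concrete, below is a small self-contained sketch of the new behavior: a packet starts a keyframe only when both the P (inter-picture predicted) and D (inter-layer predicted) bits are unset, and a merged superframe takes its type from the lowest spatial layer. All names here are illustrative stand-ins, not the actual WebRTC depacketizer or frame_helpers.cc API.

  #include <cassert>
  #include <vector>

  // Illustrative stand-in for WebRTC's VideoFrameType.
  enum class FrameType { kKey, kDelta };

  // New depacketizer rule: keyframe only if neither inter-picture (P) nor
  // inter-layer (D) prediction is signaled. Previously only the P bit was
  // checked, so upper spatial layers of an SVC keyframe (P=0, D=1) were
  // mislabeled as keyframes.
  FrameType DecideFrameType(bool p_bit, bool d_bit) {
    return (p_bit || d_bit) ? FrameType::kDelta : FrameType::kKey;
  }

  // When per-layer frames are merged into a superframe (cf. [1]), the
  // superframe inherits the frame type of the lowest spatial layer.
  FrameType SuperframeType(const std::vector<FrameType>& layers_low_to_high) {
    assert(!layers_low_to_high.empty());
    return layers_low_to_high.front();
  }

  int main() {
    // An SVC keyframe sent as two spatial layers.
    FrameType s0 = DecideFrameType(/*p_bit=*/false, /*d_bit=*/false);
    FrameType s1 = DecideFrameType(/*p_bit=*/false, /*d_bit=*/true);
    assert(s0 == FrameType::kKey);
    assert(s1 == FrameType::kDelta);  // Was wrongly a keyframe before this CL.
    assert(SuperframeType({s0, s1}) == FrameType::kKey);
    return 0;
  }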
Bug: webrtc:15827
Change-Id: Ic69b94989919cf6d353bceea85d0eba63bc500ee
Reviewed-on: https://webrtc-review.googlesource.com/c/src/+/344144
Reviewed-by: Philip Eliasson <philipel@webrtc.org>
Commit-Queue: Sergey Silkin <ssilkin@webrtc.org>
Reviewed-by: Danil Chapovalov <danilchap@webrtc.org>
Cr-Commit-Position: refs/heads/main@{#41985}
parent eea84d4953
commit db36884e76
4 changed files with 58 additions and 37 deletions
@@ -180,9 +180,6 @@ int VideoRtpDepacketizerVp9::ParseRtpPayload(
   video_header->simulcastIdx = 0;
   video_header->codec = kVideoCodecVP9;

-  video_header->frame_type =
-      p_bit ? VideoFrameType::kVideoFrameDelta : VideoFrameType::kVideoFrameKey;
-
   auto& vp9_header =
       video_header->video_type_header.emplace<RTPVideoHeaderVP9>();
   vp9_header.InitRTPVideoHeaderVP9();
@@ -211,6 +208,9 @@ int VideoRtpDepacketizerVp9::ParseRtpPayload(
       video_header->height = vp9_header.height[0];
     }
   }
+  video_header->frame_type = p_bit || vp9_header.inter_layer_predicted
+                                 ? VideoFrameType::kVideoFrameDelta
+                                 : VideoFrameType::kVideoFrameKey;
   video_header->is_first_packet_in_frame = b_bit;
   video_header->is_last_packet_in_frame = e_bit;

@@ -369,5 +369,18 @@ TEST(VideoRtpDepacketizerVp9Test, ReferencesInputCopyOnWriteBuffer) {
   // Compare pointers to check there was no copy on write buffer unsharing.
   EXPECT_EQ(parsed->video_payload.cdata(), rtp_payload.cdata() + kHeaderSize);
 }
+
+TEST(VideoRtpDepacketizerVp9Test, InterLayerPredOnlyFrameMerkedAsDelta) {
+  // Set P=0 and D=1 and verify that the depacketizer marks this packet as a
+  // part of a delta frame (not a keyframe).
+  uint8_t packet[13] = {0};
+  packet[0] = 0b0010'0000;  // I:0 P:0 L:1 F:0 B:0 E:0 V:0 Z:0
+  packet[1] = 0b0000'0001;  // T:000 U:0 S:000 D:1
+  packet[2] = 0;            // TL0PICIDX
+
+  RTPVideoHeader video_header;
+  VideoRtpDepacketizerVp9::ParseRtpPayload(packet, &video_header);
+  EXPECT_EQ(video_header.frame_type, VideoFrameType::kVideoFrameDelta);
+}
 }  // namespace
 }  // namespace webrtc
@@ -132,24 +132,20 @@ RtpVp9RefFinder::FrameDecision RtpVp9RefFinder::ManageFrameGof(
       FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
       return kHandOff;
     }
-  } else if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
-    if (frame->SpatialIndex() == 0) {
+  } else {
+    if (frame->frame_type() == VideoFrameType::kVideoFrameKey) {
       RTC_LOG(LS_WARNING) << "Received keyframe without scalability structure";
       return kDrop;
     }
-    const auto gof_info_it = gof_info_.find(unwrapped_tl0);
-    if (gof_info_it == gof_info_.end())
-      return kStash;

-    info = &gof_info_it->second;
-
-    frame->num_references = 0;
-    FrameReceivedVp9(frame->Id(), info);
-    FlattenFrameIdAndRefs(frame, codec_header.inter_layer_predicted);
-    return kHandOff;
-  } else {
-    auto gof_info_it = gof_info_.find(
-        (codec_header.temporal_idx == 0) ? unwrapped_tl0 - 1 : unwrapped_tl0);
+    // tl0_idx is incremented on temporal_idx=0 frames of the lowest spatial
+    // layer (which spatial_idx is not necessarily zero). Upper spatial layer
+    // frames with inter-layer prediction use GOF info of their base spatial
+    // layer frames.
+    const bool use_prev_gof =
+        codec_header.temporal_idx == 0 && !codec_header.inter_layer_predicted;
+    auto gof_info_it =
+        gof_info_.find(use_prev_gof ? unwrapped_tl0 - 1 : unwrapped_tl0);

     // Gof info for this frame is not available yet, stash this frame.
     if (gof_info_it == gof_info_.end())
@@ -185,29 +181,29 @@ RtpVp9RefFinder::FrameDecision RtpVp9RefFinder::ManageFrameGof(
   auto up_switch_erase_to = up_switch_.lower_bound(old_picture_id);
   up_switch_.erase(up_switch_.begin(), up_switch_erase_to);

-  size_t diff =
-      ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start, frame->Id());
-  size_t gof_idx = diff % info->gof->num_frames_in_gof;
+  if (codec_header.inter_pic_predicted) {
+    size_t diff = ForwardDiff<uint16_t, kFrameIdLength>(info->gof->pid_start,
+                                                        frame->Id());
+    size_t gof_idx = diff % info->gof->num_frames_in_gof;

-  if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) {
-    return kDrop;
-  }
-  // Populate references according to the scalability structure.
-  frame->num_references = info->gof->num_ref_pics[gof_idx];
-  for (size_t i = 0; i < frame->num_references; ++i) {
-    frame->references[i] =
-        Subtract<kFrameIdLength>(frame->Id(), info->gof->pid_diff[gof_idx][i]);
+    if (info->gof->num_ref_pics[gof_idx] > EncodedFrame::kMaxFrameReferences) {
+      return kDrop;
+    }
+
+    // Populate references according to the scalability structure.
+    frame->num_references = info->gof->num_ref_pics[gof_idx];
+    for (size_t i = 0; i < frame->num_references; ++i) {
+      frame->references[i] = Subtract<kFrameIdLength>(
+          frame->Id(), info->gof->pid_diff[gof_idx][i]);

-    // If this is a reference to a frame earlier than the last up switch point,
-    // then ignore this reference.
-    if (UpSwitchInIntervalVp9(frame->Id(), codec_header.temporal_idx,
-                              frame->references[i])) {
-      --frame->num_references;
+      // If this is a reference to a frame earlier than the last up switch
+      // point, then ignore this reference.
+      if (UpSwitchInIntervalVp9(frame->Id(), codec_header.temporal_idx,
+                                frame->references[i])) {
+        --frame->num_references;
+      }
     }
-  }
-
-  // Override GOF references.
-  if (!codec_header.inter_pic_predicted) {
+  } else {
     frame->num_references = 0;
   }

@@ -360,6 +360,18 @@ TEST_F(RtpVp9RefFinderTest, GofSkipFramesTemporalLayers_0212) {
   EXPECT_THAT(frames_, HasFrameWithIdAndRefs(35, {30}));
 }

+TEST_F(RtpVp9RefFinderTest, GofInterLayerPredS0KeyS1Delta) {
+  GofInfoVP9 ss;
+  ss.SetGofInfoVP9(kTemporalStructureMode1);
+
+  Insert(Frame().Pid(1).SidAndTid(0, 0).Tl0(0).AsKeyFrame().Gof(&ss));
+  Insert(Frame().Pid(1).SidAndTid(1, 0).Tl0(0).AsInterLayer().NotAsInterPic());
+
+  ASSERT_EQ(2UL, frames_.size());
+  EXPECT_THAT(frames_, HasFrameWithIdAndRefs(5, {}));
+  EXPECT_THAT(frames_, HasFrameWithIdAndRefs(6, {5}));
+}
+
 TEST_F(RtpVp9RefFinderTest, GofTemporalLayers_01) {
   GofInfoVP9 ss;
   ss.SetGofInfoVP9(kTemporalStructureMode2);  // 0101 pattern