Update to use Opus 1.5

2025-05-12 21:30:45 +01:00 · 2024-04-05 14:07:50 -07:00 · 2024-04-05 14:07:50 -07:00 · a170a82bb0
commit a170a82bb0
parent ed3f2f4c8a
3 changed files with 50 additions and 177 deletions
--- a/4
+++ b/4
@ -48,9 +48,9 @@ vars = {
 }

 deps = {
-  # RingRTC change to use a fork of opus
+  # RingRTC change to use the upstream xiph opus
  'src/ringrtc/opus/src':
-    'https://github.com/signalapp/opus.git@593419e833acab4d15b4901fe156177fb7315468',
+    'https://github.com/xiph/opus.git@0e30966b198ad28943799eaf5b3b08100b6f70c3',

  # TODO(kjellander): Move this to be Android-only.
  'src/base':
--- a/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
+++ b/modules/audio_coding/codecs/opus/audio_encoder_opus.cc
@ -622,7 +622,19 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
  // After 20 DTX frames (MAX_CONSECUTIVE_DTX) Opus will send a frame
  // coding the background noise. Avoid flagging this frame as speech
  // (even though there is a probability of the frame being speech).
-  info.speech = IsPacketSpeech(info.encoded_bytes, encoded->data());
+  // RingRTC change to detect if an encoded packet contains speech or not.
+  if (WebRtcOpus_GetInDtx(inst_) == 0) {
+    info.speech = true;
+    consecutive_dtx_frames_ = 0;
+  } else {
+    // Handle the case where the encoder is now in DTX mode but there might be a speech frame in the packet.
+    if (consecutive_dtx_frames_ == 0 && info.encoded_bytes > 2) {
+      info.speech = true;
+    } else {
+      info.speech = false;
+    }
+    consecutive_dtx_frames_ += 1;
+  }

  info.encoder_type = CodecType::kOpus;

@ -632,177 +644,6 @@ AudioEncoder::EncodedInfo AudioEncoderOpusImpl::EncodeImpl(
  return info;
 }

-// RingRTC change to detect if an encoded packet contains speech or not.
-// Generally, if the last frame in the packet is audio, it is speech, unless
-// it is a DTX refresh frame. This function follows RFC-6716 to check frames
-// in each encoded packet.
-bool AudioEncoderOpusImpl::IsPacketSpeech(
-    int encoded_bytes,
-    const uint8_t* encoded) {
-  bool speech = false;
-
-  // If the encoder returns 0, 1, or 2 encoded bytes, by definition, the packet
-  // contains only DTX frame(s). 0 is the special case in which
-  // opus_interface.cc detected consecutive DTX packets and is instructing
-  // WebRTC not to send any packet out over the wire.
-  bool dtx_packet = (encoded_bytes <= 2);
-
-  if (config_.frame_size_ms > 20) {
-    // For packet times greater than 20ms, Opus will encode a group of 20ms
-    // frames and combine them into a 'packet' with a TOC.
-    if (dtx_packet) {
-      // The 'packet' contains only DTX frames.
-      consecutive_dtx_frames_ += config_.frame_size_ms / 20;
-    } else {
-      // The 'packet' contains at least one non-DTX frame.
-      if (((encoded[0] & 0x98) == 0x08) ||  // config values of 1, 5, 9, and 13
-          ((encoded[0] & 0x78) == 0x78) ||  // config value of 15
-          ((encoded[0] & 0x98) == 0x98)) {  // config values of 19, 23, 27, and 31
-        // The TOC indicates a packet with 20ms frames.
-        int code = encoded[0] & 0x03;
-        if (code == 0) {
-          // Code 0: 1 frame in the packet
-          // This case is unlikely for DTX.
-          consecutive_dtx_frames_ = 0;
-          speech = true;
-        } else if (code == 1) {
-          // Code 1: 2 frames in the packet, each with equal compressed size
-          // If both frames were DTX, we would not reach here.
-          consecutive_dtx_frames_ = 0;
-          speech = true;
-        } else if (code == 2) {
-          // Code 2: 2 frames in the packet, with different compressed sizes
-          int header_bytes = 2;
-          int size_of_first_frame = encoded[1];
-          if (size_of_first_frame > 251) {
-            size_of_first_frame += encoded[2] * 4;
-            header_bytes = 3;
-          }
-          int size_of_second_frame = encoded_bytes - size_of_first_frame - header_bytes;
-          if (size_of_first_frame > 0 && size_of_second_frame > 0) {
-            // The second frame has to be speech.
-            consecutive_dtx_frames_ = 0;
-            speech = true;
-          } else if (size_of_first_frame == 0 && size_of_second_frame > 0) {
-            // Second frame may or may not be DTX refresh.
-            speech = (consecutive_dtx_frames_ + 1) != 20;
-            consecutive_dtx_frames_ = 0;
-          } else if (size_of_first_frame > 0 && size_of_second_frame == 0) {
-            // First frame may or may not be DTX refresh.
-            consecutive_dtx_frames_ = 1;
-          } else {
-            // Both frames are size 0/DTX, should not reach here.
-            consecutive_dtx_frames_ += 2;
-          }
-        } else if (code == 3) {
-          // Code 3: an arbitrary number of frames in the packet
-          bool variable = (encoded[1] & 0x80) == 0x80;
-          bool padding = (encoded[1] & 0x40) == 0x40;
-          int M = encoded[1] & 0x3f;
-
-          int padding_header_bytes = 0;
-          int padding_size = 0;
-          if (padding) {
-            if (encoded[2] == 0xff) {
-              if (encoded_bytes < 4) {
-                // The packet should be at least 4 bytes, reset.
-                consecutive_dtx_frames_ = 0;
-                return true;
-              }
-              padding_size = 254 + encoded[3];
-              padding_header_bytes = 2;
-            } else {
-              padding_size = encoded[2];
-              padding_header_bytes = 1;
-            }
-          }
-
-          if (variable) {
-            // Frames in the packet have a variable size, a mix of audio and DTX.
-            int offset = 2 + padding_header_bytes;
-            int frame_header_bytes = 0;
-            int total_size_of_frames = 0;
-
-            // Check the worst-case limits to be sure there is enough encoded
-            // data to evaluate.
-            if (encoded_bytes < offset + M * 2) {
-              // Note: This assumes that actual encoded data is larger than
-              // the guess of two bytes for each header... Reset.
-              consecutive_dtx_frames_ = 0;
-              return true;
-            }
-
-            // The only time we walk the packet header to check for dynamic frame
-            // sizes. Only expected for packets with at least one DTX frame and
-            // at least one audio/refresh frame.
-            for (int frame = 0; frame < M - 1; frame++) {
-              int frame_size = encoded[offset];
-              if (frame_size > 251) {
-                frame_size += encoded[++offset] * 4;
-                frame_header_bytes += 2;
-              } else {
-                frame_header_bytes += 1;
-              }
-
-              if (frame_size > 0) {
-                // Could be speech or a DTX refresh frame. In either case,
-                // reset the DTX count.
-                consecutive_dtx_frames_ = 0;
-              } else {
-                // DTX frame.
-                consecutive_dtx_frames_++;
-              }
-
-              total_size_of_frames += frame_size;
-              offset++;
-            }
-
-            // Then, the last frame size should be:
-            int frame_M_size = encoded_bytes - 2
-                               - (padding_header_bytes + padding_size)
-                               - (frame_header_bytes + total_size_of_frames);
-            if (frame_M_size > 0) {
-              // The packet is ending, could be speech or a DTX refresh frame.
-              speech = consecutive_dtx_frames_ != 20;
-              consecutive_dtx_frames_ = 0;
-            } else if (frame_M_size == 0) {
-              // The packet is ending on a DTX frame.
-              consecutive_dtx_frames_++;
-            } else {
-              // Badly formatted packet, reset.
-              consecutive_dtx_frames_ = 0;
-              return true;
-            }
-          } else {
-            // Frames in the packet have a constant size.
-            int R = encoded_bytes - 2 - (padding_header_bytes + padding_size);
-            if (R > 0) {
-              // All frames are the same size and larger than zero, so they must
-              // represent speech.
-              consecutive_dtx_frames_ = 0;
-              speech = true;
-            } else {
-              // All frames are DTX.
-              consecutive_dtx_frames_ += M;
-            }
-          }
-        }
-      } else {
-        // The TOC indicates a packet with something other than 20ms frames.
-        // This does not match the supported frame sizing, reset and consider
-        // the packet to represent speech.
-        consecutive_dtx_frames_ = 0;
-        speech = true;
-      }
-    }
-  } else {
-    speech = !dtx_packet && (consecutive_dtx_frames_ != 20);
-    consecutive_dtx_frames_ = (dtx_packet) ? (consecutive_dtx_frames_ + 1) : (0);
-  }
-
-  return speech;
-}
-
 size_t AudioEncoderOpusImpl::Num10msFramesPerPacket() const {
  return static_cast<size_t>(rtc::CheckedDivExact(config_.frame_size_ms, 10));
 }
--- a/ringrtc/opus/BUILD.gn
+++ b/ringrtc/opus/BUILD.gn
@ -8,7 +8,7 @@ import("//testing/test.gni")
 # If ARM optimizations shall be used to accelerate performance.
 use_opus_arm_optimization =
    current_cpu == "arm" ||
-    (current_cpu == "arm64" && (is_fuchsia || is_ios || is_win))
+    (current_cpu == "arm64" && (is_fuchsia || is_ios || is_win || is_mac))

 # NaCl, unlike Chrome, doesn't target SSE2 minimum, so skip optimizations for
 # the sake of simplicity.
@ -30,6 +30,7 @@ config("opus_private_config") {
    "OPUS_BUILD",
    "OPUS_EXPORT=",
    "ENABLE_HARDENING",
+    "DISABLE_DEBUG_FLOAT",

    # Prefer alloca() over variable length arrays which are often inefficient;
    # the opus code will automatically handle this correctly per-platform.
@ -76,6 +77,7 @@ config("opus_private_config") {
      # Run Time CPU Detections (RTCD) is always enabled for x86.
      "OPUS_HAVE_RTCD",
      "CPU_INFO_BY_ASM",
+      "FLOAT_APPROX",

      # Chrome always targets SSE2+.
      "OPUS_X86_MAY_HAVE_SSE",
@ -88,7 +90,7 @@ config("opus_private_config") {

      # At present libopus has no AVX functions so no sources are add for this,
      # if you see linker errors on AVX code the this flag is why.
-      "OPUS_X86_MAY_HAVE_AVX",
+      "OPUS_X86_MAY_HAVE_AVX2",
    ]
  }

@ -187,11 +189,39 @@ if (use_opus_x86_optimization) {
      ":opus_config",
    ]

+    if (!is_debug) {
+      configs -= [ "//build/config/compiler:default_optimization" ]
+      configs += [ "//build/config/compiler:optimize_speed" ]
+    }
+
    if (!is_win || is_clang) {
      cflags = [ "-msse4.1" ]
    }
  }
-  # TODO(dalecurtis): If libopus ever adds AVX support, add an opus_avx block.
+
+  source_set("opus_avx2") {
+    sources = [
+      "src/celt/x86/pitch_avx.c",
+      "src/silk/x86/NSQ_del_dec_avx2.c",
+      "src/silk/float/x86/inner_product_FLP_avx2.c",
+    ]
+
+    configs -= [ "//build/config/compiler:chromium_code" ]
+    configs += [ "//build/config/compiler:no_chromium_code" ]
+    configs += [
+      ":opus_private_config",
+      ":opus_config",
+    ]
+
+    if (!is_debug) {
+      configs -= [ "//build/config/compiler:default_optimization" ]
+      configs += [ "//build/config/compiler:optimize_speed" ]
+    }
+
+    if (!is_win || is_clang) {
+      cflags = [ "-mavx", "-mfma", "-mavx2" ]
+    }
+  }
 }

 # Note: Do not add any defines or include_dirs to this target, those should all
@ -350,6 +380,7 @@ static_library("opus") {
    "src/silk/typedef.h",
    "src/src/analysis.c",
    "src/src/analysis.h",
+    "src/src/extensions.c",
    "src/src/mapping_matrix.c",
    "src/src/mapping_matrix.h",
    "src/src/mlp.c",
@ -437,6 +468,7 @@ static_library("opus") {
      "src/silk/x86/x86_silk_map.c",
    ]
    deps += [ ":opus_sse41" ]
+    deps += [ ":opus_avx2" ]
  }

  if (use_opus_arm_optimization) {