Handle out-of-order frames in endIndices for MP4 with edit list

Updated logic to walk forward in the timestamps array to include all frames within the valid edit duration, accounting for out-of-order frames. This ensures that no frames with timestamps less than or equal to `editMediaTime` + `editDuration` are incorrectly excluded.

Issue: #1797

PiperOrigin-RevId: 686075680
androidx · Oct 15, 2024 · 91c5633 · 91c5633
1 parent 9adb3aa
commit 91c5633
Show file tree

Hide file tree

Showing 13 changed files with 152 additions and 39 deletions.
diff --git a/RELEASENOTES.md b/RELEASENOTES.md
@@ -78,6 +78,8 @@
         timescale, `media_time` is now properly scaled using the track
         timescale, as specified by the MP4 format standard
         ([#1792](https://github.com/androidx/media/issues/1792)).
+    *   Handle out-of-order frames in `endIndices` calculation for MP4 with edit
+        list ([#1797](https://github.com/androidx/media/issues/1797)).
 *   DataSource:
 *   Audio:
     *   Fix pop sounds that may occur during seeks.

diff --git a/libraries/extractor/src/main/java/androidx/media3/extractor/mp4/BoxParser.java b/libraries/extractor/src/main/java/androidx/media3/extractor/mp4/BoxParser.java
@@ -717,22 +717,39 @@ public static TrackSampleTable parseStbl(
             Util.scaleLargeTimestamp(
                 track.editListDurations[i], track.timescale, track.movieTimescale);
         // The timestamps array is in the order read from the media, which might not be strictly
-        // sorted, but will ensure that a) all sync frames are in-order and b) any out-of-order
-        // frames are after their respective sync frames. This means that although the result of
-        // this binary search might be slightly incorrect (due to out-of-order timestamps), the loop
-        // below that walks backward to find the previous sync frame will result in a correct start
-        // index.
+        // sorted. However, all sync frames are guaranteed to be in order, and any out-of-order
+        // frames appear after their respective sync frames. This ensures that although the result
+        // of the binary search might not be entirely accurate (due to the out-of-order timestamps),
+        // the following logic ensures correctness for both start and end indices.
+        //
+        // The startIndices calculation finds the index of the largest timestamp that is less than
+        // or equal to editMediaTime. It then walks backward to ensure the index points to a sync
+        // frame, since decoding must start from a keyframe.
         startIndices[i] =
             Util.binarySearchFloor(
                 timestamps, editMediaTime, /* inclusive= */ true, /* stayInBounds= */ true);
+        while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) {
+          startIndices[i]--;
+        }
+        // The endIndices calculation finds the index of the smallest timestamp that is greater
+        // than editMediaTime + editDuration, except when omitZeroDurationClippedSample is true, in
+        // which case it finds the index of the smallest timestamp that is greater than or equal to
+        // editMediaTime + editDuration.
         endIndices[i] =
             Util.binarySearchCeil(
                 timestamps,
                 editMediaTime + editDuration,
                 /* inclusive= */ omitZeroDurationClippedSample,
                 /* stayInBounds= */ false);
-        while (startIndices[i] >= 0 && (flags[startIndices[i]] & C.BUFFER_FLAG_KEY_FRAME) == 0) {
-          startIndices[i]--;
+        if (track.type == C.TRACK_TYPE_VIDEO) {
+          // To account for out-of-order video frames that may have timestamps smaller than or equal
+          // to editMediaTime + editDuration, but still fall within the valid range, the loop walks
+          // forward through the timestamps array to ensure all frames with timestamps within the
+          // edit duration are included.
+          while (endIndices[i] < timestamps.length - 1
+              && timestamps[endIndices[i] + 1] <= (editMediaTime + editDuration)) {
+            endIndices[i]++;
+          }
         }
         editedSampleCount += endIndices[i] - startIndices[i];
         copyMetadata |= nextSampleIndex != startIndices[i];

diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.0.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.0.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 3112471
-  sample count = 83
+  total output bytes = 3208515
+  sample count = 85
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -358,8 +358,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 82:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 83:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 84:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 45765
   sample count = 112

diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.1.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.1.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 2168517
-  sample count = 60
+  total output bytes = 2264561
+  sample count = 62
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -266,8 +266,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 59:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 60:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 61:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 30664
   sample count = 76

diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.2.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.2.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 1019852
-  sample count = 28
+  total output bytes = 1115896
+  sample count = 30
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -138,8 +138,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 27:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 28:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 29:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 15570
   sample count = 39

diff --git a/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.3.dump b/libraries/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.3.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 1019852
-  sample count = 28
+  total output bytes = 1115896
+  sample count = 30
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -138,8 +138,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 27:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 28:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 29:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 1239
   sample count = 3

diff --git a/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.0.dump b/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.0.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 3112471
-  sample count = 83
+  total output bytes = 3208515
+  sample count = 85
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -358,8 +358,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 82:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 83:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 84:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 45765
   sample count = 112

diff --git a/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.1.dump b/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.1.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 2168517
-  sample count = 60
+  total output bytes = 2264561
+  sample count = 62
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -266,8 +266,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 59:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 60:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 61:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 30664
   sample count = 76

diff --git a/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.2.dump b/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.2.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 1019852
-  sample count = 28
+  total output bytes = 1115896
+  sample count = 30
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -138,8 +138,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 27:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 28:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 29:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 15570
   sample count = 39

diff --git a/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.3.dump b/...ets/extractordumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.3.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 1019852
-  sample count = 28
+  total output bytes = 1115896
+  sample count = 30
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -138,8 +138,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 27:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 28:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 29:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 1239
   sample count = 3

diff --git a/...dumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.unknown_length.dump b/...dumps/mp4/sample_edit_list.mp4.reading_within_gop_sample_dependencies.unknown_length.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 3112471
-  sample count = 83
+  total output bytes = 3208515
+  sample count = 85
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -358,8 +358,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 82:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 83:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 84:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 45765
   sample count = 112

diff --git a/...ies/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.unknown_length.dump b/...ies/test_data/src/test/assets/extractordumps/mp4/sample_edit_list.mp4.unknown_length.dump
@@ -7,8 +7,8 @@ seekMap:
   getPosition(2548333) = [[timeUs=1680000, position=34939]]
 numberOfTracks = 2
 track 0:
-  total output bytes = 3112471
-  sample count = 83
+  total output bytes = 3208515
+  sample count = 85
   format 0:
     id = 1
     sampleMimeType = video/dolby-vision
@@ -358,8 +358,16 @@ track 0:
     data = length 23136, hash 8AF1C1AD
   sample 82:
     time = 2446666
-    flags = 536870912
+    flags = 0
     data = length 26792, hash 3157758F
+  sample 83:
+    time = 2613333
+    flags = 0
+    data = length 62711, hash EF9AC8F5
+  sample 84:
+    time = 2546666
+    flags = 536870912
+    data = length 33333, hash 567D33D6
 track 1:
   total output bytes = 45765
   sample count = 112

diff --git a/libraries/test_data/src/test/assets/playbackdumps/mp4/sample_edit_list.mp4.dump b/libraries/test_data/src/test/assets/playbackdumps/mp4/sample_edit_list.mp4.dump
@@ -793,7 +793,7 @@ MediaCodecAdapter (exotest.audio.aac):
       rendered = false
 MediaCodecAdapter (exotest.video.hevc):
   inputBuffers:
-    count = 84
+    count = 86
     input buffer #0:
       timeUs = 999999545000
       contents = length 78829, hash 9265686F
@@ -1044,11 +1044,17 @@ MediaCodecAdapter (exotest.video.hevc):
       timeUs = 1000002446666
       contents = length 26792, hash 3157758F
     input buffer #83:
+      timeUs = 1000002613333
+      contents = length 62711, hash EF9AC8F5
+    input buffer #84:
+      timeUs = 1000002546666
+      contents = length 33333, hash 567D33D6
+    input buffer #85:
       timeUs = 0
       flags = 4
       contents = length 0, hash 1
   outputBuffers:
-    count = 83
+    count = 85
     output buffer #0:
       timeUs = 999999545000
       size = 78829
@@ -1381,6 +1387,14 @@ MediaCodecAdapter (exotest.video.hevc):
       timeUs = 1000002446666
       size = 26792
       rendered = true
+    output buffer #83:
+      timeUs = 1000002613333
+      size = 62711
+      rendered = true
+    output buffer #84:
+      timeUs = 1000002546666
+      size = 33333
+      rendered = true
 AudioSink:
   buffer count = 112
   config: