From c71dbe9825da45ddf6e87c2e5eafdec74717417e Mon Sep 17 00:00:00 2001
From: Luca Rossetto <rossetto@ifi.uzh.ch>
Date: Fri, 6 Jan 2023 15:24:19 +0100
Subject: [PATCH 1/4] First iteration of decoder based on external ffmpeg
 binary. Does not yet support audio or image scaling.

---
 cineast-core/build.gradle                     |   2 +
 .../core/data/frames/AudioDescriptor.java     |  57 +----
 .../cineast/core/data/frames/AudioFrame.java  |  34 +--
 .../core/data/frames/VideoDescriptor.java     |  53 +---
 .../containers/AudioQueryTermContainer.java   |   7 +-
 .../core/data/segments/AudioSegment.java      |  13 +-
 .../core/data/segments/VideoSegment.java      |  19 +-
 .../video/FFMpegProcessVideoDecoder.java      | 233 ++++++++++++++++++
 .../decode/video/FFMpegVideoDecoder.java      |  27 +-
 .../run/GenericExtractionItemHandler.java     |  49 ++--
 .../runtime/ExtractionPipeline.java           |  18 +-
 11 files changed, 330 insertions(+), 182 deletions(-)
 create mode 100644 cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java

diff --git a/cineast-core/build.gradle b/cineast-core/build.gradle
index 826128ce7..c5e9085bf 100644
--- a/cineast-core/build.gradle
+++ b/cineast-core/build.gradle
@@ -144,6 +144,8 @@ dependencies {
     api group: "org.bytedeco", name: "javacpp", version: version_javacpp
     api group: "org.bytedeco", name: "ffmpeg-platform", version: version_ffmpeg
 
+    api group: 'com.github.kokorin.jaffree', name: 'jaffree', version: '2022.06.03'
+
     /** OpenCV. */
     api group: 'org.openpnp', name: 'opencv', version: version_opencv
 
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioDescriptor.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioDescriptor.java
index 927fc29c6..7b6ef4597 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioDescriptor.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioDescriptor.java
@@ -2,59 +2,12 @@
 
 /**
  * The class encapsulates descriptive information concerning an audio-stream that does not change between frames. The intention behind this class is that {@link AudioFrame}s that belong together share the same instance of the AudioDescriptor.
+ *
+ * @param samplingrate Samplingrate of the audio associated with this descriptor.
+ * @param channels     Number of channels in the audio associated with this descriptor.
+ * @param duration     Duration of the audio associated with this descriptor in milliseconds.
  */
-public class AudioDescriptor {
-
-  /**
-   * Samplingrate of the audio associated with this descriptor.
-   */
-  private final float samplingrate;
-
-  /**
-   * Number of channels in the audio associated with this descriptor.
-   */
-  private final int channels;
-
-  /**
-   * Duration of the audio associated with this descriptor in milliseconds.
-   */
-  private final long duration;
-
-  /**
-   * Constructor for an AudioDescriptor.
-   */
-  public AudioDescriptor(float samplingrate, int channels, long duration) {
-    this.samplingrate = samplingrate;
-    this.channels = channels;
-    this.duration = duration;
-  }
-
-  /**
-   * Getter for the samplingrate.
-   *
-   * @return Samplingrate of the source stream.
-   */
-  public final float getSamplingrate() {
-    return this.samplingrate;
-  }
-
-  /**
-   * Getter for channels.
-   *
-   * @return Number of channels in the source stream
-   */
-  public final int getChannels() {
-    return this.channels;
-  }
-
-  /**
-   * Getter for duration.
-   *
-   * @return Duration of the total source stream
-   */
-  public final long getDuration() {
-    return this.duration;
-  }
+public record AudioDescriptor(float samplingrate, int channels, long duration) {
 
   @Override
   public boolean equals(Object o) {
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioFrame.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioFrame.java
index c320c7973..948d85bc4 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioFrame.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/AudioFrame.java
@@ -1,10 +1,10 @@
 package org.vitrivr.cineast.core.data.frames;
 
 
+import javax.sound.sampled.AudioFormat;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.util.Objects;
-import javax.sound.sampled.AudioFormat;
 
 /**
  * Represents a single audio-frame containing a specific number of samples (the number depends on the decoder that created the AudioFrame). Sample data is stored in a byte array and internally represented as 16bit int PCM i.e. each sample is represented by a signed 16bit short between -32767 and 32767.
@@ -64,7 +64,7 @@ public AudioFrame(long idx, long timestamp, byte[] data, AudioDescriptor descrip
   }
 
   public AudioFrame(AudioFrame other) {
-    this(other.idx, other.timestamp, other.data.array(), new AudioDescriptor(other.descriptor.getSamplingrate(), other.descriptor.getChannels(), other.descriptor.getDuration()));
+    this(other.idx, other.timestamp, other.data.array(), new AudioDescriptor(other.descriptor.samplingrate(), other.descriptor.channels(), other.descriptor.duration()));
   }
 
   /**
@@ -73,7 +73,7 @@ public AudioFrame(AudioFrame other) {
    * @return AudioFormat
    */
   public final AudioFormat getFormat() {
-    return new AudioFormat(this.descriptor.getSamplingrate(), BITS_PER_SAMPLE, this.descriptor.getChannels(), true, false);
+    return new AudioFormat(this.descriptor.samplingrate(), BITS_PER_SAMPLE, this.descriptor.channels(), true, false);
   }
 
   /**
@@ -137,7 +137,7 @@ public final byte[] getData() {
    */
   private void setData(byte[] data) {
     this.data = ByteBuffer.wrap(data).order(ByteOrder.LITTLE_ENDIAN);
-    this.numberOfSamples = data.length / (2 * this.descriptor.getChannels());
+    this.numberOfSamples = data.length / (2 * this.descriptor.channels());
   }
 
   /**
@@ -146,7 +146,7 @@ private void setData(byte[] data) {
    * @return Sample rate of this AudioFrame.
    */
   public final float getSamplingrate() {
-    return this.descriptor.getSamplingrate();
+    return this.descriptor.samplingrate();
   }
 
   /**
@@ -155,7 +155,7 @@ public final float getSamplingrate() {
    * @return Duration of the {@link AudioFrame}
    */
   public final float getDuration() {
-    return this.numberOfSamples / this.descriptor.getSamplingrate();
+    return this.numberOfSamples / this.descriptor.samplingrate();
   }
 
   /**
@@ -173,7 +173,7 @@ public final float getStart() {
    * @return Relative end of the {@link AudioFrame}.
    */
   public final float getEnd() {
-    return this.getStart() + this.numberOfSamples / this.descriptor.getSamplingrate();
+    return this.getStart() + this.numberOfSamples / this.descriptor.samplingrate();
   }
 
   /**
@@ -182,7 +182,7 @@ public final float getEnd() {
    * @return Number of channels in this AudioFrame.
    */
   public final int getChannels() {
-    return this.descriptor.getChannels();
+    return this.descriptor.channels();
   }
 
   /**
@@ -193,8 +193,8 @@ public final int getChannels() {
    * @return Sample value for the specified channel at the specified index.
    */
   public final short getSampleAsShort(int idx, int channel) {
-    if (channel < this.descriptor.getChannels()) {
-      return this.data.getShort(2 * idx * this.descriptor.getChannels() + 2 * channel);
+    if (channel < this.descriptor.channels()) {
+      return this.data.getShort(2 * idx * this.descriptor.channels() + 2 * channel);
     } else {
       throw new IllegalArgumentException("The channel indexed must not exceed the number of channels!");
     }
@@ -219,10 +219,10 @@ public final double getSampleAsDouble(int idx, int channel) {
    */
   public final short getMeanSampleAsShort(int idx) {
     int meanSample = 0;
-    for (int i = 0; i < this.descriptor.getChannels(); i++) {
+    for (int i = 0; i < this.descriptor.channels(); i++) {
       meanSample += this.getSampleAsShort(idx, i);
     }
-    return (short) (meanSample / this.descriptor.getChannels());
+    return (short) (meanSample / this.descriptor.channels());
   }
 
   /**
@@ -233,10 +233,10 @@ public final short getMeanSampleAsShort(int idx) {
    */
   public final double getMeanSampleAsDouble(int idx) {
     float meanSample = 0;
-    for (int i = 0; i < this.descriptor.getChannels(); i++) {
+    for (int i = 0; i < this.descriptor.channels(); i++) {
       meanSample += this.getSampleAsShort(idx, i);
     }
-    return (meanSample / (this.descriptor.getChannels() * Short.MAX_VALUE));
+    return (meanSample / (this.descriptor.channels() * Short.MAX_VALUE));
   }
 
   /**
@@ -250,7 +250,7 @@ public boolean append(AudioFrame that, int numberOfSamples) {
     if (!this.descriptor.equals(that.descriptor)) {
       return false;
     }
-    int bytes = that.descriptor.getChannels() * numberOfSamples * (BITS_PER_SAMPLE / 8);
+    int bytes = that.descriptor.channels() * numberOfSamples * (BITS_PER_SAMPLE / 8);
     if (bytes > that.data.capacity()) {
       return false;
     }
@@ -281,7 +281,7 @@ public AudioFrame split(int numberOfSamples) {
       return this;
     }
 
-    int bytesToCut = this.descriptor.getChannels() * numberOfSamples * (BITS_PER_SAMPLE / 8);
+    int bytesToCut = this.descriptor.channels() * numberOfSamples * (BITS_PER_SAMPLE / 8);
     byte[] cutBytes = new byte[bytesToCut];
     byte[] remaining = new byte[this.data.capacity() - bytesToCut];
 
@@ -290,7 +290,7 @@ public AudioFrame split(int numberOfSamples) {
 
     setData(remaining);
 
-    return new AudioFrame(idx, timestamp, cutBytes, new AudioDescriptor(descriptor.getSamplingrate(), descriptor.getChannels(), (long) (numberOfSamples / descriptor.getSamplingrate())));
+    return new AudioFrame(idx, timestamp, cutBytes, new AudioDescriptor(descriptor.samplingrate(), descriptor.channels(), (long) (numberOfSamples / descriptor.samplingrate())));
   }
 
   @Override
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/VideoDescriptor.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/VideoDescriptor.java
index 88a1b31f0..bd1217c8c 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/VideoDescriptor.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/frames/VideoDescriptor.java
@@ -2,54 +2,13 @@
 
 /**
  * The class encapsulates descriptive information concerning a video-stream (visual only) that does not change between frames. The intention behind this class is that {@link VideoFrame}s that belong together share the same instance of the AudioDescriptor.
+ *
+ * @param fps      Frame rate of the video associated with this descriptor.
+ * @param duration Duration of the video associated with this descriptor in milliseconds.
+ * @param width    Width of the video associated with this descriptor.
+ * @param height   Height of the video associated with this descriptor.
  */
-public class VideoDescriptor {
-
-  /**
-   * Frame rate of the video associated with this descriptor.
-   */
-  private final float fps;
-
-  /**
-   * Duration of the video associated with this descriptor in milliseconds.
-   */
-  private final long duration;
-
-  /**
-   * Width of the video associated with this descriptor.
-   */
-  private final int width;
-
-  /**
-   * Height of the video associated with this descriptor.
-   */
-  private final int height;
-
-  /**
-   * Constructor for VideoDescriptor
-   */
-  public VideoDescriptor(float fps, long duration, int width, int height) {
-    this.fps = fps;
-    this.duration = duration;
-    this.width = width;
-    this.height = height;
-  }
-
-  public float getFps() {
-    return fps;
-  }
-
-  public long getDuration() {
-    return duration;
-  }
-
-  public int getWidth() {
-    return width;
-  }
-
-  public int getHeight() {
-    return height;
-  }
+public record VideoDescriptor(float fps, long duration, int width, int height) {
 
   @Override
   public boolean equals(Object o) {
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/query/containers/AudioQueryTermContainer.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/query/containers/AudioQueryTermContainer.java
index f2f7cc0c7..5e1878c9d 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/query/containers/AudioQueryTermContainer.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/query/containers/AudioQueryTermContainer.java
@@ -1,13 +1,14 @@
 package org.vitrivr.cineast.core.data.query.containers;
 
-import java.util.List;
-import java.util.Objects;
 import org.vitrivr.cineast.core.data.frames.AudioDescriptor;
 import org.vitrivr.cineast.core.data.frames.AudioFrame;
 import org.vitrivr.cineast.core.util.dsp.fft.STFT;
 import org.vitrivr.cineast.core.util.dsp.fft.windows.WindowFunction;
 import org.vitrivr.cineast.core.util.web.AudioParser;
 
+import java.util.List;
+import java.util.Objects;
+
 
 public class AudioQueryTermContainer extends AbstractQueryTermContainer {
 
@@ -94,7 +95,7 @@ public STFT getSTFT(int windowsize, int overlap, int padding, WindowFunction fun
     if (2 * padding >= windowsize) {
       throw new IllegalArgumentException("The combined padding must be smaller than the sample window.");
     }
-    STFT stft = new STFT(windowsize, overlap, padding, function, this.descriptor.getSamplingrate());
+    STFT stft = new STFT(windowsize, overlap, padding, function, this.descriptor.samplingrate());
     stft.forward(this.getMeanSamplesAsDouble());
     return stft;
   }
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/AudioSegment.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/AudioSegment.java
index 5d3502fc3..351e00bf6 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/AudioSegment.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/AudioSegment.java
@@ -1,13 +1,14 @@
 package org.vitrivr.cineast.core.data.segments;
 
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
 import org.vitrivr.cineast.core.data.frames.AudioDescriptor;
 import org.vitrivr.cineast.core.data.frames.AudioFrame;
 import org.vitrivr.cineast.core.util.dsp.fft.STFT;
 import org.vitrivr.cineast.core.util.dsp.fft.windows.WindowFunction;
 
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
 /**
  * This AudioSegment is part of the Cineast data model and can hold an arbitrary number of AudioFrames that somehow belong together. The class itself is agnostic to how segmenting is organized.
  * <p>
@@ -119,12 +120,12 @@ public float getAudioDuration() {
 
   @Override
   public float getSamplingrate() {
-    return this.descriptor.getSamplingrate();
+    return this.descriptor.samplingrate();
   }
 
   @Override
   public int getChannels() {
-    return this.descriptor.getChannels();
+    return this.descriptor.channels();
   }
 
   /**
@@ -189,7 +190,7 @@ public STFT getSTFT(int windowsize, int overlap, int padding, WindowFunction fun
     if (2 * padding >= windowsize) {
       throw new IllegalArgumentException("The combined padding must be smaller than the sample window.");
     }
-    STFT stft = new STFT(windowsize, overlap, padding, function, this.descriptor.getSamplingrate());
+    STFT stft = new STFT(windowsize, overlap, padding, function, this.descriptor.samplingrate());
     stft.forward(this.getMeanSamplesAsDouble());
     return stft;
   }
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/VideoSegment.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/VideoSegment.java
index 05bc0b80f..727910359 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/VideoSegment.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/data/segments/VideoSegment.java
@@ -2,10 +2,6 @@
 
 import boofcv.struct.geo.AssociatedPair;
 import georegression.struct.point.Point2D_F32;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.LinkedList;
-import java.util.List;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.vitrivr.cineast.core.data.Pair;
@@ -23,6 +19,11 @@
 import org.vitrivr.cineast.core.util.dsp.fft.STFT;
 import org.vitrivr.cineast.core.util.dsp.fft.windows.WindowFunction;
 
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedList;
+import java.util.List;
+
 public class VideoSegment implements SegmentContainer {
 
   private static final Logger LOGGER = LogManager.getLogger();
@@ -157,7 +158,7 @@ public float getAudioDuration() {
   @Override
   public float getSamplingrate() {
     if (this.audioDescriptor != null) {
-      return this.audioDescriptor.getSamplingrate();
+      return this.audioDescriptor.samplingrate();
     } else {
       return 0;
     }
@@ -169,7 +170,7 @@ public float getSamplingrate() {
   @Override
   public int getChannels() {
     if (this.audioDescriptor != null) {
-      return this.audioDescriptor.getChannels();
+      return this.audioDescriptor.channels();
     } else {
       return 0;
     }
@@ -192,7 +193,7 @@ public STFT getSTFT(int windowsize, int overlap, int padding, WindowFunction fun
     if (2 * padding >= windowsize) {
       throw new IllegalArgumentException("The combined padding must be smaller than the sample window.");
     }
-    STFT stft = new STFT(windowsize, overlap, padding, function, this.audioDescriptor.getSamplingrate());
+    STFT stft = new STFT(windowsize, overlap, padding, function, this.audioDescriptor.samplingrate());
     stft.forward(this.getMeanSamplesAsDouble());
     return stft;
   }
@@ -334,7 +335,7 @@ public int getEnd() {
    */
   @Override
   public float getRelativeStart() {
-    return (1000.0f * this.getStart()) / this.videoDescriptor.getDuration();
+    return (1000.0f * this.getStart()) / this.videoDescriptor.duration();
   }
 
   /**
@@ -342,7 +343,7 @@ public float getRelativeStart() {
    */
   @Override
   public float getRelativeEnd() {
-    return (1000.0f * this.getEnd()) / this.videoDescriptor.getDuration();
+    return (1000.0f * this.getEnd()) / this.videoDescriptor.duration();
   }
 
   /**
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
new file mode 100644
index 000000000..724af35d0
--- /dev/null
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
@@ -0,0 +1,233 @@
+package org.vitrivr.cineast.core.extraction.decode.video;
+
+import com.github.kokorin.jaffree.StreamType;
+import com.github.kokorin.jaffree.ffmpeg.*;
+import com.github.kokorin.jaffree.ffprobe.FFprobe;
+import com.github.kokorin.jaffree.ffprobe.FFprobeResult;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.vitrivr.cineast.core.config.CacheConfig;
+import org.vitrivr.cineast.core.config.DecoderConfig;
+import org.vitrivr.cineast.core.data.frames.AudioDescriptor;
+import org.vitrivr.cineast.core.data.frames.VideoDescriptor;
+import org.vitrivr.cineast.core.data.frames.VideoFrame;
+import org.vitrivr.cineast.core.data.raw.CachedDataFactory;
+import org.vitrivr.cineast.core.data.raw.images.MultiImage;
+import org.vitrivr.cineast.core.extraction.decode.general.Decoder;
+
+import java.awt.image.BufferedImage;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.LinkedBlockingQueue;
+
+/**
+ * A {@link Decoder} that produces {@link VideoFrame}s by running an external ffmpeg binary through the Jaffree wrapper.
+ * <p>
+ * Stream information is read with ffprobe in {@link #init}; ffmpeg is then started asynchronously and its decoded frames are buffered in a bounded queue that {@link #getNext()} drains. Audio decoding and image rescaling are not implemented yet.
+ */
+public class FFMpegProcessVideoDecoder implements Decoder<VideoFrame> {
+
+    /**
+     * Lists the mime types supported by the FFMpegProcessVideoDecoder.
+     * <p>
+     * TODO: List may not be complete yet.
+     */
+    public static final Set<String> supportedFiles = Set.of("multimedia/mp4", "video/mp4", "video/avi", "video/mpeg", "video/quicktime", "video/webm");
+    /** Configuration property name: max width of the converted video. */
+    private static final String CONFIG_MAXWIDTH_PROPERTY = "maxFrameWidth";
+    /** Configuration property name: max height of the converted video. */
+    private static final String CONFIG_HEIGHT_PROPERTY = "maxFrameHeight";
+    /** Configuration property name: number of channels of the converted audio. If <= 0, then no audio will be decoded. Not evaluated by this decoder yet. */
+    private static final String CONFIG_CHANNELS_PROPERTY = "channels";
+    /** Configuration property name: samplerate of the converted audio. Not evaluated by this decoder yet. */
+    private static final String CONFIG_SAMPLERATE_PROPERTY = "samplerate";
+    /** Configuration property name: indicates whether subtitles should be decoded as well. Not evaluated by this decoder yet. */
+    private static final String CONFIG_SUBTITLE_PROPERTY = "subtitles";
+    /** Configuration property default: max width of the converted video. */
+    private final static int CONFIG_MAXWIDTH_DEFAULT = 1920;
+    /** Configuration property default: max height of the converted video. */
+    private final static int CONFIG_MAXHEIGHT_DEFAULT = 1080;
+    /** Configuration property default: number of channels of the converted audio. */
+    private static final int CONFIG_CHANNELS_DEFAULT = 1;
+    /** Configuration property default: sample rate of the converted audio. */
+    private static final int CONFIG_SAMPLERATE_DEFAULT = 44100;
+    /** Time in milliseconds {@link #getNext()} waits for a frame before re-checking whether decoding has finished. */
+    private static final long POLL_TIMEOUT_MS = 100L;
+    private static final Logger LOGGER = LogManager.getLogger();
+
+    /** Path handed to {@code FFprobe.atPath} and {@code FFmpeg.atPath} to locate the binaries. */
+    private final Path ffmpegPath = Path.of("ffmpeg");
+    /** Bounded buffer between the ffmpeg frame consumer and {@link #getNext()}; a full queue blocks the producer. */
+    private final LinkedBlockingQueue<VideoFrame> videoFrameQueue = new LinkedBlockingQueue<>(10);
+    /** Handle of the asynchronous ffmpeg run; null before {@link #init} and after {@link #close()}. */
+    private FFmpegResultFuture future = null;
+    /** Frame count reported by ffprobe for the video stream; stays 0 if ffprobe did not report one. */
+    private int estimatedFrameCount = 0;
+    private CachedDataFactory factory;
+
+    /** Converts a duration in seconds as reported by ffprobe into rounded milliseconds; a missing (null) duration yields 0. */
+    private static long toMillis(Number seconds) {
+        return seconds == null ? 0L : Math.round(seconds.doubleValue() * 1000d);
+    }
+
+    /**
+     * Probes the file with ffprobe and starts an asynchronous ffmpeg run that feeds decoded video frames into the frame queue.
+     *
+     * @param path          Path to the video file to decode.
+     * @param decoderConfig Supplies the optional 'maxFrameWidth' and 'maxFrameHeight' properties.
+     * @param cacheConfig   Provides the {@link CachedDataFactory} used to create the frame images; must not be null.
+     * @return True if ffmpeg was started, false if the file is missing, no cache config was given or no video stream was found.
+     */
+    @Override
+    public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheConfig) {
+        if (!Files.exists(path)) {
+            LOGGER.error("File does not exist {}", path.toString());
+            return false;
+        }
+
+        /* Initialize MultiImageFactory using the ImageCacheConfig. */
+        if (cacheConfig == null) {
+            LOGGER.error("You must provide a valid ImageCacheConfig when initializing the FFMpegProcessVideoDecoder.");
+            return false;
+        }
+        this.factory = cacheConfig.sharedCachedDataFactory();
+
+        /* Check container and stream information; frame rate, sample rate and duration can be absent from the ffprobe output, so they are null-checked before use. */
+        FFprobeResult ffprobeResult = FFprobe.atPath(ffmpegPath).setInput(path).setShowStreams(true).execute();
+        VideoDescriptor videoDescriptor = null;
+        final HashMap<Integer, AudioDescriptor> audioDescriptors = new HashMap<>();
+        for (com.github.kokorin.jaffree.ffprobe.Stream stream : ffprobeResult.getStreams()) {
+            if (stream.getCodecType() == StreamType.VIDEO) {
+                var fps = stream.getAvgFrameRate();
+                videoDescriptor = new VideoDescriptor(fps == null ? 0f : fps.floatValue(), toMillis(stream.getDuration()), stream.getWidth(), stream.getHeight());
+                if (stream.getNbFrames() != null) {
+                    this.estimatedFrameCount = stream.getNbFrames();
+                }
+                continue;
+            }
+            if (stream.getCodecType() == StreamType.AUDIO) {
+                var sampleRate = stream.getSampleRate();
+                AudioDescriptor descriptor = new AudioDescriptor(sampleRate == null ? 0f : sampleRate.floatValue(), stream.getChannels(), toMillis(stream.getDuration()));
+                audioDescriptors.put(stream.getIndex(), descriptor);
+            }
+        }
+        if (videoDescriptor == null) {
+            LOGGER.error("No video stream found in {}", path.toString());
+            return false;
+        }
+
+        final int maxWidth = decoderConfig.namedAsInt(CONFIG_MAXWIDTH_PROPERTY, CONFIG_MAXWIDTH_DEFAULT);
+        final int maxHeight = decoderConfig.namedAsInt(CONFIG_HEIGHT_PROPERTY, CONFIG_MAXHEIGHT_DEFAULT);
+        VideoDescriptor finalVideoDescriptor = videoDescriptor;
+        future = FFmpeg.atPath(ffmpegPath)
+                .addInput(UrlInput.fromPath(path))
+                .addOutput(FrameOutput.withConsumer(
+                        new FrameConsumer() {
+                            final HashMap<Integer, Stream> streamHashMap = new HashMap<>();
+                            int frameCounter = 0;
+
+                            @Override
+                            public void consumeStreams(List<Stream> streams) {
+                                for (Stream stream : streams) {
+                                    streamHashMap.put(stream.getId(), stream);
+                                }
+                            }
+
+                            @Override
+                            public void consume(Frame frame) {
+                                /* Jaffree signals the end of the stream with a null frame; there is nothing to enqueue for it. */
+                                if (frame == null) {
+                                    return;
+                                }
+                                Stream stream = streamHashMap.get(frame.getStreamId());
+                                if (stream == null) {
+                                    //no information about the stream, ignore frame
+                                    LOGGER.debug("received frame from unknown stream {}, ignoring", frame.getStreamId());
+                                    return;
+                                }
+
+                                switch (stream.getType()) {
+                                    case VIDEO -> {
+                                        BufferedImage bimg = frame.getImage();
+                                        if (bimg.getWidth() > maxWidth || bimg.getHeight() > maxHeight) {
+                                            //TODO rescale
+                                        }
+                                        MultiImage image = factory.newMultiImage(bimg);
+                                        VideoFrame videoFrame = new VideoFrame(frameCounter++, (1000 * frame.getPts()) / stream.getTimebase(), image, finalVideoDescriptor);
+                                        try {
+                                            videoFrameQueue.put(videoFrame);
+                                        } catch (InterruptedException e) {
+                                            LOGGER.error("Could not enqueue frame", e);
+                                            /* Restore the interrupt flag so the thread running this consumer can react to it. */
+                                            Thread.currentThread().interrupt();
+                                        }
+                                    }
+                                    case AUDIO -> {
+                                        //TODO audio data conversion
+                                    }
+                                }
+                            }
+                        }
+                ))
+                .executeAsync();
+        return true;
+    }
+
+    /** Asks ffmpeg to stop gracefully and discards buffered frames, which also frees queue capacity for a producer blocked in put(). */
+    @Override
+    public void close() {
+        if (this.future != null) {
+            this.future.graceStop();
+            this.future = null;
+        }
+        this.videoFrameQueue.clear();
+    }
+
+    /**
+     * Returns the next decoded frame, waiting for ffmpeg to deliver one if necessary.
+     *
+     * @return The next {@link VideoFrame}, or null if decoding is complete or the calling thread was interrupted while waiting.
+     */
+    @Override
+    public VideoFrame getNext() {
+        try {
+            /* Poll with a timeout instead of take(): if ffmpeg finishes without delivering another frame, take() would block forever. */
+            while (!this.complete()) {
+                VideoFrame frame = this.videoFrameQueue.poll(POLL_TIMEOUT_MS, java.util.concurrent.TimeUnit.MILLISECONDS);
+                if (frame != null) {
+                    return frame;
+                }
+            }
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+        }
+        return null;
+    }
+
+    /** Returns the number of frames ffprobe reported for the video stream, 0 if none was reported. */
+    @Override
+    public int count() {
+        return estimatedFrameCount;
+    }
+
+    /** Returns true once there is no ffmpeg run in progress (never started, closed, done or cancelled) and the frame queue is empty. */
+    @Override
+    public boolean complete() {
+        return (this.future == null || this.future.isDone() || this.future.isCancelled()) && this.videoFrameQueue.isEmpty();
+    }
+
+    /** Returns the mime types listed in {@link #supportedFiles}. */
+    @Override
+    public Set<String> supportedFiles() {
+        return supportedFiles;
+    }
+
+    /** Always returns false, i.e. reports that this decoder instance cannot be reused. */
+    @Override
+    public boolean canBeReused() {
+        return false;
+    }
+}
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegVideoDecoder.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegVideoDecoder.java
index 32c9bdd03..08e6bc064 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegVideoDecoder.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegVideoDecoder.java
@@ -1,13 +1,5 @@
 package org.vitrivr.cineast.core.extraction.decode.video;
 
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayDeque;
-import java.util.Optional;
-import java.util.Set;
-import java.util.concurrent.atomic.AtomicBoolean;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.bytedeco.ffmpeg.avcodec.AVCodec;
@@ -24,11 +16,7 @@
 import org.bytedeco.ffmpeg.global.swscale;
 import org.bytedeco.ffmpeg.swresample.SwrContext;
 import org.bytedeco.ffmpeg.swscale.SwsContext;
-import org.bytedeco.javacpp.BytePointer;
-import org.bytedeco.javacpp.DoublePointer;
-import org.bytedeco.javacpp.IntPointer;
-import org.bytedeco.javacpp.Pointer;
-import org.bytedeco.javacpp.PointerPointer;
+import org.bytedeco.javacpp.*;
 import org.vitrivr.cineast.core.config.CacheConfig;
 import org.vitrivr.cineast.core.config.DecoderConfig;
 import org.vitrivr.cineast.core.data.frames.AudioDescriptor;
@@ -43,6 +31,15 @@
 import org.vitrivr.cineast.core.extraction.decode.subtitle.SubtitleItem;
 import org.vitrivr.cineast.core.util.LogHelper;
 
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayDeque;
+import java.util.Optional;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicBoolean;
+
 /**
  * A {@link Decoder} implementation that decodes videos using the ffmpeg library + the corresponding Java bindings.
  */
@@ -299,7 +296,7 @@ private void readVideo() {
     }
 
     /* Prepare frame and associated timestamp and add it to output queue. */
-    VideoFrame videoFrame = new VideoFrame(this.pCodecCtxVideo.frame_number(), this.getFrameTimestamp(this.videoStream), this.factory.newMultiImage(this.videoDescriptor.getWidth(), this.videoDescriptor.getHeight(), pixels), this.videoDescriptor);
+    VideoFrame videoFrame = new VideoFrame(this.pCodecCtxVideo.frame_number(), this.getFrameTimestamp(this.videoStream), this.factory.newMultiImage(this.videoDescriptor.width(), this.videoDescriptor.height(), pixels), this.videoDescriptor);
     this.videoFrameQueue.add(videoFrame);
   }
 
@@ -656,7 +653,7 @@ public int count() {
   }
 
   /**
-   * Indicates whether or not the current decoder has more content to return or not.
+   * Indicates whether the current decoder has more content to return or not.
    *
    * @return True if more content can be fetched, false otherwise.
    */
diff --git a/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/run/GenericExtractionItemHandler.java b/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/run/GenericExtractionItemHandler.java
index df16ab9f9..f0711a107 100644
--- a/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/run/GenericExtractionItemHandler.java
+++ b/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/run/GenericExtractionItemHandler.java
@@ -1,20 +1,6 @@
 package org.vitrivr.cineast.standalone.run;
 
 import com.google.common.collect.Sets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Optional;
-import java.util.Set;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.TimeUnit;
-import java.util.function.Supplier;
-import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.lang3.tuple.ImmutablePair;
 import org.apache.commons.lang3.tuple.Pair;
@@ -42,7 +28,7 @@
 import org.vitrivr.cineast.core.extraction.decode.image.DefaultImageDecoder;
 import org.vitrivr.cineast.core.extraction.decode.image.ImageSequenceDecoder;
 import org.vitrivr.cineast.core.extraction.decode.m3d.ModularMeshDecoder;
-import org.vitrivr.cineast.core.extraction.decode.video.FFMpegVideoDecoder;
+import org.vitrivr.cineast.core.extraction.decode.video.FFMpegProcessVideoDecoder;
 import org.vitrivr.cineast.core.extraction.idgenerator.ObjectIdGenerator;
 import org.vitrivr.cineast.core.extraction.metadata.MetadataExtractor;
 import org.vitrivr.cineast.core.extraction.segmenter.audio.ConstantLengthAudioSegmenter;
@@ -50,7 +36,7 @@
 import org.vitrivr.cineast.core.extraction.segmenter.general.Segmenter;
 import org.vitrivr.cineast.core.extraction.segmenter.image.ImageSegmenter;
 import org.vitrivr.cineast.core.extraction.segmenter.image.ImageSequenceSegmenter;
-import org.vitrivr.cineast.core.extraction.segmenter.video.VideoHistogramSegmenter;
+import org.vitrivr.cineast.core.extraction.segmenter.video.ConstantLengthVideoSegmenter;
 import org.vitrivr.cineast.core.features.abstracts.MetadataFeatureModule;
 import org.vitrivr.cineast.core.util.LogHelper;
 import org.vitrivr.cineast.core.util.MimeTypeHelper;
@@ -58,6 +44,16 @@
 import org.vitrivr.cineast.standalone.config.Config;
 import org.vitrivr.cineast.standalone.runtime.ExtractionPipeline;
 
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.*;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
+import java.util.function.Supplier;
+import java.util.stream.Collectors;
+
 /**
  * This class is used to extract a continuous list of {@link ExtractionItemContainer}s.
  * <p>
@@ -124,7 +120,8 @@ public GenericExtractionItemHandler(ExtractionContainerProvider pathProvider, Ex
     handlers.put(MediaType.IMAGE, new ImmutablePair<>(DefaultImageDecoder::new, () -> new ImageSegmenter(context)));
     handlers.put(MediaType.IMAGE_SEQUENCE, new ImmutablePair<>(ImageSequenceDecoder::new, () -> new ImageSequenceSegmenter(context)));
     handlers.put(MediaType.AUDIO, new ImmutablePair<>(FFMpegAudioDecoder::new, () -> new ConstantLengthAudioSegmenter(context)));
-    handlers.put(MediaType.VIDEO, new ImmutablePair<>(FFMpegVideoDecoder::new, () -> new VideoHistogramSegmenter(context)));
+    //handlers.put(MediaType.VIDEO, new ImmutablePair<>(FFMpegVideoDecoder::new, () -> new VideoHistogramSegmenter(context)));
+    handlers.put(MediaType.VIDEO, new ImmutablePair<>(FFMpegProcessVideoDecoder::new, () -> new ConstantLengthVideoSegmenter(1f))); //FIXME make properly configurable
     handlers.put(MediaType.MODEL3D, new ImmutablePair<>(ModularMeshDecoder::new, () -> new PassthroughSegmenter<Mesh>() {
       @Override
       protected SegmentContainer getSegmentFromContent(Mesh content) {
@@ -139,10 +136,12 @@ public Set<MediaType> getMediaTypes() {
     // #353: Respect the given segmenter
     final Set<MediaType> segmenterTypes;
     final Segmenter<Object> segmenter = context.newSegmenter();
-    segmenterTypes = segmenter.getMediaTypes();
-    segmenterTypes.forEach(t -> {
-      handlers.put(t, new ImmutablePair<>(handlers.get(t).getLeft(), () -> segmenter));
-    });
+    if (segmenter != null) {
+      segmenterTypes = segmenter.getMediaTypes();
+      segmenterTypes.forEach(t -> {
+        handlers.put(t, new ImmutablePair<>(handlers.get(t).getLeft(), () -> segmenter));
+      });
+    }
 
     //Config overwrite
     Config.sharedConfig().getDecoders().forEach((type, decoderConfig) -> {
@@ -151,7 +150,13 @@ public Set<MediaType> getMediaTypes() {
 
     //TODO Config should allow for multiple segmenters
 
-    this.handlers.forEach((key, value) -> handlerCache.put(key, ImmutablePair.of(value.getLeft().get(), value.getRight().get())));
+    this.handlers.forEach((key, value) -> { // instantiate one decoder/segmenter pair per media type up front
+      try {
+        handlerCache.put(key, ImmutablePair.of(value.getLeft().get(), value.getRight().get()));
+      } catch (Exception e) {
+        //TODO handle or at least log this: a supplier that throws currently leaves its media type out of handlerCache without any trace
+      }
+    });
 
   }
 
diff --git a/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/runtime/ExtractionPipeline.java b/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/runtime/ExtractionPipeline.java
index 7a67c3322..19fe2c5df 100644
--- a/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/runtime/ExtractionPipeline.java
+++ b/cineast-runtime/src/main/java/org/vitrivr/cineast/standalone/runtime/ExtractionPipeline.java
@@ -1,13 +1,5 @@
 package org.vitrivr.cineast.standalone.runtime;
 
-import java.util.LinkedList;
-import java.util.List;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.LinkedBlockingQueue;
-import java.util.concurrent.RejectedExecutionException;
-import java.util.concurrent.ThreadPoolExecutor;
-import java.util.concurrent.TimeUnit;
 import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
@@ -19,6 +11,10 @@
 import org.vitrivr.cineast.standalone.config.ExtractionPipelineConfig;
 import org.vitrivr.cineast.standalone.monitoring.PrometheusExtractionTaskMonitor;
 
+import java.util.LinkedList;
+import java.util.List;
+import java.util.concurrent.*;
+
 
 public class ExtractionPipeline implements Runnable, ExecutionTimeCounter {
 
@@ -46,12 +42,12 @@ public class ExtractionPipeline implements Runnable, ExecutionTimeCounter {
   private final ExecutorService executorService;
 
   /**
-   * ExtractionContextProvider used to setup the Pipeline. It contains information about the Extractors.
+   * ExtractionContextProvider used to set up the Pipeline. It contains information about the Extractors.
    */
   private final ExtractionContextProvider context;
 
   /**
-   * Flag indicating whether or not the ExtractionPipeline is running.
+   * Flag indicating whether the ExtractionPipeline is running.
    */
   private volatile boolean running = false;
 
@@ -98,7 +94,7 @@ public synchronized void stop() {
   }
 
   /**
-   * Indicates whether or not the ExtractionPipeline is still running i.e. the while loop in the run() method is still being executed. Even if this flag is set to false, there might still be ExtractionTasks in the executor that have not finished yet!
+   * Indicates whether the ExtractionPipeline is still running i.e. the while loop in the run() method is still being executed. Even if this flag is set to false, there might still be ExtractionTasks in the executor that have not finished yet!
    *
    * @return True if the ExtractionPipeline is running, false otherwise.
    */

From 33f6e132636d00f2c8f39d3428f1cdbeb9956c73 Mon Sep 17 00:00:00 2001
From: Luca Rossetto <rossetto@ifi.uzh.ch>
Date: Fri, 6 Jan 2023 16:26:44 +0100
Subject: [PATCH 2/4] Added image scaling and audio decoding (untested) to
 FFMpegProcessVideoDecoder

---
 .../video/FFMpegProcessVideoDecoder.java      | 114 ++++++++++++------
 1 file changed, 80 insertions(+), 34 deletions(-)

diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
index 724af35d0..290cfe11d 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
@@ -4,11 +4,13 @@
 import com.github.kokorin.jaffree.ffmpeg.*;
 import com.github.kokorin.jaffree.ffprobe.FFprobe;
 import com.github.kokorin.jaffree.ffprobe.FFprobeResult;
+import net.coobird.thumbnailator.Thumbnails;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.vitrivr.cineast.core.config.CacheConfig;
 import org.vitrivr.cineast.core.config.DecoderConfig;
 import org.vitrivr.cineast.core.data.frames.AudioDescriptor;
+import org.vitrivr.cineast.core.data.frames.AudioFrame;
 import org.vitrivr.cineast.core.data.frames.VideoDescriptor;
 import org.vitrivr.cineast.core.data.frames.VideoFrame;
 import org.vitrivr.cineast.core.data.raw.CachedDataFactory;
@@ -16,12 +18,16 @@
 import org.vitrivr.cineast.core.extraction.decode.general.Decoder;
 
 import java.awt.image.BufferedImage;
+import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+
 
 public class FFMpegProcessVideoDecoder implements Decoder<VideoFrame> {
 
@@ -39,34 +45,12 @@ public class FFMpegProcessVideoDecoder implements Decoder<VideoFrame> {
      * Configuration property name for the {@link FFMpegVideoDecoder}: max height of the converted video.
      */
     private static final String CONFIG_HEIGHT_PROPERTY = "maxFrameHeight";
-    /**
-     * Configuration property name for the {@link FFMpegVideoDecoder}: number of channels of the converted audio. If <= 0, then no audio will be decoded.
-     */
-    private static final String CONFIG_CHANNELS_PROPERTY = "channels";
-    /**
-     * Configuration property name for the {@link FFMpegVideoDecoder}: samplerate of the converted audio.
-     */
-    private static final String CONFIG_SAMPLERATE_PROPERTY = "samplerate";
-    /**
-     * Configuration property name for the {@link FFMpegVideoDecoder}: Indicates whether subtitles should be decoded as well.
-     */
-    private static final String CONFIG_SUBTITLE_PROPERTY = "subtitles";
-    /**
-     * Configuration property default for the FFMpegVideoDecoder: max width of the converted video.
-     */
+
     private final static int CONFIG_MAXWIDTH_DEFAULT = 1920;
     /**
      * Configuration property default for the FFMpegVideoDecoder: max height of the converted video.
      */
     private final static int CONFIG_MAXHEIGHT_DEFAULT = 1080;
-    /**
-     * Configuration property default for the FFMpegVideoDecoder: number of channels of the converted audio.
-     */
-    private static final int CONFIG_CHANNELS_DEFAULT = 1;
-    /**
-     * Configuration property default for the FFMpegVideoDecoder: sample rate of the converted audio
-     */
-    private static final int CONFIG_SAMPLERATE_DEFAULT = 44100;
 
     private final Path ffmpegPath = Path.of("ffmpeg");
 
@@ -77,6 +61,7 @@ public class FFMpegProcessVideoDecoder implements Decoder<VideoFrame> {
     private CachedDataFactory factory;
 
     private final LinkedBlockingQueue<VideoFrame> videoFrameQueue = new LinkedBlockingQueue<>(10);
+    private final ConcurrentLinkedQueue<AudioFrame> audioFrameQueue = new ConcurrentLinkedQueue<>();
 
     @Override
     public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheConfig) {
@@ -100,7 +85,7 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
         VideoDescriptor videoDescriptor = null;
         final HashMap<Integer, AudioDescriptor> audioDescriptors = new HashMap<>();
 
-        for (com.github.kokorin.jaffree.ffprobe.Stream stream: ffprobeResult.getStreams()) {
+        for (com.github.kokorin.jaffree.ffprobe.Stream stream : ffprobeResult.getStreams()) {
             if (stream.getCodecType() == StreamType.VIDEO) {
                 videoDescriptor = new VideoDescriptor(stream.getAvgFrameRate().floatValue(), Math.round(stream.getDuration() * 1000d), stream.getWidth(), stream.getHeight());
                 if (stream.getNbFrames() != null) {
@@ -119,8 +104,8 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
             return false;
         }
 
-        final int maxWidth = decoderConfig.namedAsInt(CONFIG_MAXWIDTH_PROPERTY, CONFIG_MAXWIDTH_DEFAULT);
-        final int maxHeight = decoderConfig.namedAsInt(CONFIG_HEIGHT_PROPERTY, CONFIG_MAXHEIGHT_DEFAULT);
+        final float maxWidth = decoderConfig.namedAsInt(CONFIG_MAXWIDTH_PROPERTY, CONFIG_MAXWIDTH_DEFAULT);
+        final float maxHeight = decoderConfig.namedAsInt(CONFIG_HEIGHT_PROPERTY, CONFIG_MAXHEIGHT_DEFAULT);
 
         VideoDescriptor finalVideoDescriptor = videoDescriptor;
         future = FFmpeg.atPath(ffmpegPath)
@@ -131,10 +116,15 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
                             final HashMap<Integer, Stream> streamHashMap = new HashMap<>();
                             int frameCounter = 0;
 
+                            final HashMap<Integer, AtomicInteger> audioFrameIdCounter = new HashMap<>();
+
                             @Override
                             public void consumeStreams(List<Stream> streams) {
                                 for (Stream stream : streams) {
                                     streamHashMap.put(stream.getId(), stream);
+                                    if (stream.getType() == Stream.Type.AUDIO) {
+                                        audioFrameIdCounter.put(stream.getId(), new AtomicInteger());
+                                    }
                                 }
                             }
 
@@ -156,7 +146,12 @@ public void consume(Frame frame) {
                                         BufferedImage bimg = frame.getImage();
 
                                         if (bimg.getWidth() > maxWidth || bimg.getHeight() > maxHeight) {
-                                            //TODO rescale
+                                            double scale = Math.min(maxWidth / bimg.getWidth(), maxHeight / bimg.getHeight()); // factor < 1 that fits the frame into maxWidth x maxHeight, keeping aspect ratio
+                                            try {
+                                                bimg = Thumbnails.of(bimg).scale(scale).asBufferedImage();
+                                            } catch (IOException e) {
+                                                LOGGER.error("Could not scale frame", e); // bimg is left unscaled and still passed on below
+                                            }
                                         }
 
                                         MultiImage image = factory.newMultiImage(bimg);
@@ -168,23 +163,55 @@ public void consume(Frame frame) {
                                             LOGGER.error("Could not enqueue frame", e);
                                         }
 
-                                        break;
                                     }
                                     case AUDIO -> {
 
-                                        //TODO audio data conversion
+                                        AudioDescriptor descriptor = audioDescriptors.get(stream.getId());
+
+                                        if (descriptor == null) {
+                                            LOGGER.debug("received audio frame from unknown stream {}, ignoring", frame.getStreamId());
+                                            return;
+                                        }
+
+                                        int[] samples = frame.getSamples();
+
+                                        if (samples == null) {
+                                            return;
+                                        }
+
+                                        AtomicInteger idCounter = audioFrameIdCounter.get(stream.getId());
+
+                                        if (idCounter == null) {
+                                            return;
+                                        }
+
+                                        byte[] reEncoded = new byte[samples.length * 2];
+
+                                        for (int i = 0; i < samples.length; ++i) {
+
+                                            short s = (short) (samples[i] / 65536);
+                                            reEncoded[2*i] = ((byte) ((s) & 0xff));
+                                            reEncoded[2*i + 1] = ((byte) ((s >> 8) & 0xff));
+
+                                        }
+
+                                        AudioFrame audioFrame = new AudioFrame(
+                                                idCounter.getAndIncrement(),
+                                                (1000 * frame.getPts()) / stream.getTimebase(),
+                                                reEncoded,
+                                                descriptor
+                                        );
+
+                                        audioFrameQueue.add(audioFrame);
 
                                         break;
                                     }
                                 }
-
-
                             }
                         }
                 ))
                 .executeAsync();
 
-
         return true;
     }
 
@@ -204,7 +231,22 @@ public VideoFrame getNext() {
             return null;
         }
         try {
-            return videoFrameQueue.take();
+            VideoFrame frame = videoFrameQueue.take();
+
+            while (!decoderComplete()) {
+                AudioFrame audioFrame = this.audioFrameQueue.peek();
+                if (audioFrame == null) {
+                    break;
+                }
+                if (audioFrame.getTimestamp() <= frame.getTimestamp()) {
+                    frame.addAudioFrame(this.audioFrameQueue.poll());
+                } else {
+                    break;
+                }
+            }
+
+
+            return frame;
         } catch (InterruptedException e) {
             return null;
         }
@@ -216,9 +258,13 @@ public int count() {
         return estimatedFrameCount;
     }
 
+    private boolean decoderComplete() { // true when no ffmpeg run is pending: never started, finished, or cancelled
+        return this.future == null || this.future.isDone() || this.future.isCancelled();
+    }
+
     @Override
     public boolean complete() {
-        return (this.future == null || this.future.isDone() || this.future.isCancelled()) && this.videoFrameQueue.isEmpty();
+        return this.videoFrameQueue.isEmpty() && decoderComplete();
     }
 
     @Override

From 9735cd6fa01b763f2b0bff4e7ef4b49ae585cacc Mon Sep 17 00:00:00 2001
From: Luca Rossetto <rossetto@ifi.uzh.ch>
Date: Tue, 24 Jan 2023 11:13:35 +0100
Subject: [PATCH 3/4] Initial version of FFMpegProcessVideoDecoder

Audio is not re-sampled, so it might come in a different format than expected, depending on source file
---
 build.gradle                                  | 67 +++++++++++++++++++
 .../video/FFMpegProcessVideoDecoder.java      | 24 +++----
 2 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/build.gradle b/build.gradle
index a6665342f..9206ef0eb 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,3 +1,5 @@
+import org.gradle.nativeplatform.platform.internal.DefaultNativePlatform
+
 plugins {
     id 'com.github.johnrengelman.shadow' version '7.1.2'
     id 'de.undercouch.download' version "5.0.5"
@@ -128,3 +130,68 @@ task generateOAS(type: Download) {
     src 'http://localhost:4567/openapi-specs'
     dest "${project.projectDir}/docs/openapi.json"
 }
+
+task downloadFFmpeg(type: Download) { // fetches the prebuilt ffmpeg 4.2.1 zip for the host OS into the build cache
+    def f = new File("$buildDir/cache/ffmpeg.zip")
+    outputs.upToDateWhen { // skip the download once the archive is present
+        return f.exists()
+    }
+
+    def os = "" // stays empty on any other OS, which yields a URL without a platform name
+    if (DefaultNativePlatform.currentOperatingSystem.isWindows()) {
+        os = "win"
+    } else if (DefaultNativePlatform.currentOperatingSystem.isMacOsX()) {
+        os = "osx"
+    } else if (DefaultNativePlatform.currentOperatingSystem.isLinux()) {
+        os = "linux"
+    }
+
+    src "https://github.com/vot/ffbinaries-prebuilt/releases/download/v4.2.1/ffmpeg-4.2.1-$os-64.zip"
+    dest f
+}
+
+task downloadFFprobe(type: Download) { // fetches the prebuilt ffprobe 4.2.1 zip for the host OS into the build cache
+    def f = new File("$buildDir/cache/ffprobe.zip")
+    outputs.upToDateWhen { // skip the download once the archive is present
+        return f.exists()
+    }
+
+    def os = "" // stays empty on any other OS, which yields a URL without a platform name
+    if (DefaultNativePlatform.currentOperatingSystem.isWindows()) {
+        os = "win"
+    } else if (DefaultNativePlatform.currentOperatingSystem.isMacOsX()) {
+        os = "osx"
+    } else if (DefaultNativePlatform.currentOperatingSystem.isLinux()) {
+        os = "linux"
+    }
+
+    src "https://github.com/vot/ffbinaries-prebuilt/releases/download/v4.2.1/ffprobe-4.2.1-$os-64.zip"
+    dest f
+}
+
+task copyFFmpeg(type: Copy) { // unpacks the ffmpeg binary from the downloaded zip into build/ext/ffmpeg
+    dependsOn downloadFFmpeg
+    outputs.upToDateWhen { // up to date as soon as any file named ffmpeg* exists in the target directory
+        return !fileTree("$buildDir/ext/ffmpeg").filter { it.isFile() && it.name.startsWith('ffmpeg') }.isEmpty()
+    }
+    from zipTree(downloadFFmpeg.dest)
+    into "$buildDir/ext/ffmpeg"
+    include '*ffmpeg*' // only entries whose name contains "ffmpeg" are extracted
+}
+
+task copyFFprobe(type: Copy) { // unpacks the ffprobe binary from the downloaded zip into build/ext/ffmpeg
+    dependsOn downloadFFprobe
+    outputs.upToDateWhen { // up to date as soon as any file named ffprobe* exists in the target directory
+        return !fileTree("$buildDir/ext/ffmpeg").filter { it.isFile() && it.name.startsWith('ffprobe') }.isEmpty()
+    }
+    from zipTree(downloadFFprobe.dest)
+    into "$buildDir/ext/ffmpeg" // same directory as the ffmpeg binary
+    include '*ffprobe*' // only entries whose name contains "ffprobe" are extracted
+}
+
+task setupFFMpeg(type: Copy) { // aggregate entry point; NOTE(review): typed as Copy but declares no from/into - confirm a plain task was intended
+    dependsOn downloadFFmpeg
+    dependsOn downloadFFprobe
+    dependsOn copyFFmpeg
+    dependsOn copyFFprobe
+}
\ No newline at end of file
diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
index 290cfe11d..8945ddfcd 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
@@ -24,7 +24,6 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Set;
-import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.atomic.AtomicInteger;
 
@@ -61,7 +60,7 @@ public class FFMpegProcessVideoDecoder implements Decoder<VideoFrame> {
     private CachedDataFactory factory;
 
     private final LinkedBlockingQueue<VideoFrame> videoFrameQueue = new LinkedBlockingQueue<>(10);
-    private final ConcurrentLinkedQueue<AudioFrame> audioFrameQueue = new ConcurrentLinkedQueue<>();
+    private final LinkedBlockingQueue<AudioFrame> audioFrameQueue = new LinkedBlockingQueue<>(1000);
 
     @Override
     public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheConfig) {
@@ -73,7 +72,7 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
 
         /* Initialize MultiImageFactory using the ImageCacheConfig. */
         if (cacheConfig == null) {
-            LOGGER.error("You must provide a valid ImageCacheConfig when initializing the FFMpegVideoDecoder.");
+            LOGGER.error("You must provide a valid ImageCacheConfig when initializing the FFMpegProcessVideoDecoder.");
             return false;
         }
         this.factory = cacheConfig.sharedCachedDataFactory();
@@ -83,7 +82,7 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
         FFprobeResult ffprobeResult = FFprobe.atPath(ffmpegPath).setInput(path).setShowStreams(true).execute();
 
         VideoDescriptor videoDescriptor = null;
-        final HashMap<Integer, AudioDescriptor> audioDescriptors = new HashMap<>();
+        AudioDescriptor audioDescriptor = null;
 
         for (com.github.kokorin.jaffree.ffprobe.Stream stream : ffprobeResult.getStreams()) {
             if (stream.getCodecType() == StreamType.VIDEO) {
@@ -94,8 +93,7 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
                 continue;
             }
             if (stream.getCodecType() == StreamType.AUDIO) {
-                AudioDescriptor descriptor = new AudioDescriptor(stream.getSampleRate().floatValue(), stream.getChannels(), Math.round(stream.getDuration() * 1000d));
-                audioDescriptors.put(stream.getIndex(), descriptor);
+                audioDescriptor = new AudioDescriptor(stream.getSampleRate().floatValue(), stream.getChannels(), Math.round(stream.getDuration() * 1000d)); //TODO stream id mismatch between ffprobe and ffmpeg, figure out how to deal with multiple streams
             }
         }
 
@@ -108,6 +106,7 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
         final float maxHeight = decoderConfig.namedAsInt(CONFIG_HEIGHT_PROPERTY, CONFIG_MAXHEIGHT_DEFAULT);
 
         VideoDescriptor finalVideoDescriptor = videoDescriptor;
+        AudioDescriptor finalAudioDescriptor = audioDescriptor;
         future = FFmpeg.atPath(ffmpegPath)
                 .addInput(UrlInput.fromPath(path))
                 .addOutput(FrameOutput.withConsumer(
@@ -166,9 +165,7 @@ public void consume(Frame frame) {
                                     }
                                     case AUDIO -> {
 
-                                        AudioDescriptor descriptor = audioDescriptors.get(stream.getId());
-
-                                        if (descriptor == null) {
+                                        if (finalAudioDescriptor == null) {
                                             LOGGER.debug("received audio frame from unknown stream {}, ignoring", frame.getStreamId());
                                             return;
                                         }
@@ -199,12 +196,15 @@ public void consume(Frame frame) {
                                                 idCounter.getAndIncrement(),
                                                 (1000 * frame.getPts()) / stream.getTimebase(),
                                                 reEncoded,
-                                                descriptor
+                                                finalAudioDescriptor
                                         );
 
-                                        audioFrameQueue.add(audioFrame);
+                                        try {
+                                            audioFrameQueue.put(audioFrame);
+                                        } catch (InterruptedException e) {
+                                            LOGGER.error("Could not enqueue audio frame", e);
+                                        }
 
-                                        break;
                                     }
                                 }
                             }

From 4f1290f5d2153c70867de829b9e2b5f09b6e1367 Mon Sep 17 00:00:00 2001
From: Luca Rossetto <rossetto@ifi.uzh.ch>
Date: Tue, 31 Jan 2023 14:34:09 +0100
Subject: [PATCH 4/4] Moved image rescaling and audio resampling into ffmpeg
 process when using FFMpegProcessVideoDecoder

---
 .../video/FFMpegProcessVideoDecoder.java      | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
index 8945ddfcd..9ed55eda2 100644
--- a/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
+++ b/cineast-core/src/main/java/org/vitrivr/cineast/core/extraction/decode/video/FFMpegProcessVideoDecoder.java
@@ -105,9 +105,12 @@ public boolean init(Path path, DecoderConfig decoderConfig, CacheConfig cacheCon
         final float maxWidth = decoderConfig.namedAsInt(CONFIG_MAXWIDTH_PROPERTY, CONFIG_MAXWIDTH_DEFAULT);
         final float maxHeight = decoderConfig.namedAsInt(CONFIG_HEIGHT_PROPERTY, CONFIG_MAXHEIGHT_DEFAULT);
 
+        final int TARGET_AUDIO_SAMPLE_RATE = 44100;
+        final int TARGET_AUDIO_CHANNELS = 1;
+
         VideoDescriptor finalVideoDescriptor = videoDescriptor;
-        AudioDescriptor finalAudioDescriptor = audioDescriptor;
-        future = FFmpeg.atPath(ffmpegPath)
+        AudioDescriptor finalAudioDescriptor = (audioDescriptor == null) ? null : new AudioDescriptor(TARGET_AUDIO_SAMPLE_RATE, TARGET_AUDIO_CHANNELS, audioDescriptor.duration());
+        FFmpeg ffmpeg = FFmpeg.atPath(ffmpegPath)
                 .addInput(UrlInput.fromPath(path))
                 .addOutput(FrameOutput.withConsumer(
                         new FrameConsumer() {
@@ -209,8 +212,20 @@ public void consume(Frame frame) {
                                 }
                             }
                         }
-                ))
-                .executeAsync();
+                ));
+
+        if (videoDescriptor.height() > maxHeight || videoDescriptor.width() > maxWidth) { // only ever shrink, never enlarge
+            float scale = Math.min(maxWidth / videoDescriptor.width(), maxHeight / videoDescriptor.height()); // factor < 1 that fits both limits, keeping aspect ratio
+            int w = Math.round(videoDescriptor.width() * scale);
+            int h = Math.round(videoDescriptor.height() * scale);
+            ffmpeg = ffmpeg.setFilter(StreamType.VIDEO, "scale=" + w + ":" + h);
+        }
+
+        if (finalAudioDescriptor != null) { // ask ffmpeg for the target format that finalAudioDescriptor advertises, not the source format
+            ffmpeg = ffmpeg.addArguments("-ac", String.valueOf(TARGET_AUDIO_CHANNELS)).addArguments("-ar", String.valueOf(TARGET_AUDIO_SAMPLE_RATE));
+        }
+
+        future = ffmpeg.executeAsync();
 
         return true;
     }