Add audio and video converters. Fix audio resampling. (#8)

elixir-webrtc · Aug 5, 2024 · dad836a · dad836a
1 parent 407330e
commit dad836a
Show file tree

Hide file tree

Showing 27 changed files with 795 additions and 496 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -77,6 +77,11 @@ jobs:
     name: test-macos-x86-64 / macos-13 / OTP latest / Elixir latest
     env:
       MIX_ENV: test
+      # macOS runners seem to have static IP addresses,
+      # which results in GitHub rate-limiting our requests
+      # to download prebuilt XLA binaries.
+      # Adding a token seems to help.
+      XLA_HTTP_HEADERS: "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}"
     steps:
       - uses: actions/checkout@v2
       - run: brew install ffmpeg elixir
@@ -88,6 +93,7 @@ jobs:
     name: test-macos-arm / macos-14 / OTP latest / Elixir latest
     env:
       MIX_ENV: test
+      XLA_HTTP_HEADERS: "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}"
     steps:
       - uses: actions/checkout@v2
       - run: brew install ffmpeg elixir

diff --git a/Makefile b/Makefile
@@ -5,13 +5,17 @@
 
 XAV_DIR = c_src/xav
 PRIV_DIR = $(MIX_APP_PATH)/priv
-XAV_SO = $(PRIV_DIR)/libxav.so
+XAV_DECODER_SO = $(PRIV_DIR)/libxavdecoder.so
+XAV_READER_SO = $(PRIV_DIR)/libxavreader.so
 
 # uncomment to compile with debug logs
 # XAV_DEBUG_LOGS = -DXAV_DEBUG=1
 
-HEADERS = $(XAV_DIR)/reader.h $(XAV_DIR)/decoder.h $(XAV_DIR)/utils.h
-SOURCES = $(XAV_DIR)/xav_nif.c $(XAV_DIR)/reader.c $(XAV_DIR)/decoder.c $(XAV_DIR)/utils.c
+DECODER_HEADERS = $(XAV_DIR)/xav_decoder.h $(XAV_DIR)/decoder.h $(XAV_DIR)/video_converter.h $(XAV_DIR)/audio_converter.h $(XAV_DIR)/utils.h $(XAV_DIR)/channel_layout.h
+DECODER_SOURCES = $(XAV_DIR)/xav_decoder.c $(XAV_DIR)/decoder.c $(XAV_DIR)/video_converter.c $(XAV_DIR)/audio_converter.c $(XAV_DIR)/utils.c
+
+READER_HEADERS = $(XAV_DIR)/xav_reader.h $(XAV_DIR)/reader.h $(XAV_DIR)/video_converter.h $(XAV_DIR)/audio_converter.h $(XAV_DIR)/utils.h $(XAV_DIR)/channel_layout.h
+READER_SOURCES = $(XAV_DIR)/xav_reader.c $(XAV_DIR)/reader.c $(XAV_DIR)/video_converter.c $(XAV_DIR)/audio_converter.c $(XAV_DIR)/utils.c
 
 CFLAGS = $(XAV_DEBUG_LOGS) -fPIC -shared
 IFLAGS = -I$(ERTS_INCLUDE_DIR) -I$(XAV_DIR)
@@ -27,9 +31,15 @@ ifeq ($(shell uname -s),Darwin)
 	endif
 endif
 
-$(XAV_SO): Makefile $(SOURCES) $(HEADERS)
+all: $(XAV_DECODER_SO) $(XAV_READER_SO)
+
+$(XAV_DECODER_SO): Makefile $(DECODER_SOURCES) $(DECODER_HEADERS)
+	mkdir -p $(PRIV_DIR)
+	$(CC) $(CFLAGS) $(IFLAGS) $(LFLAGS) $(DECODER_SOURCES) -o $(XAV_DECODER_SO) $(LDFLAGS)
+
+$(XAV_READER_SO): Makefile $(READER_SOURCES) $(READER_HEADERS)
 	mkdir -p $(PRIV_DIR)
-	$(CC) $(CFLAGS) $(IFLAGS) $(LFLAGS) $(SOURCES) -o $(XAV_SO) $(LDFLAGS)
+	$(CC) $(CFLAGS) $(IFLAGS) $(LFLAGS) $(READER_SOURCES) -o $(XAV_READER_SO) $(LDFLAGS)
 
 format:
 	clang-format -i $(XAV_DIR)/*

diff --git a/c_src/xav/audio_converter.c b/c_src/xav/audio_converter.c
@@ -0,0 +1,99 @@
+#include <libavutil/channel_layout.h>
+#include <libavutil/opt.h>
+#include <libavutil/samplefmt.h>
+#include <libswresample/swresample.h>
+#include <stdint.h>
+
+#include "audio_converter.h"
+#include "channel_layout.h"
+#include "utils.h"
+
+/**
+ * Allocates an AudioConverter that has no resampler context attached yet.
+ *
+ * @return the new converter, or NULL when the allocation fails.
+ */
+struct AudioConverter *audio_converter_alloc(void) {
+  struct AudioConverter *converter =
+      (struct AudioConverter *)XAV_ALLOC(sizeof(struct AudioConverter));
+
+  // Bail out before touching the struct if the allocator returned nothing.
+  if (converter == NULL) {
+    return NULL;
+  }
+
+  // Mark the context as unset so audio_converter_free() can skip swr_free().
+  converter->swr_ctx = NULL;
+  return converter;
+}
+
+/**
+ * Creates the resampler context of a converter and configures it with the
+ * given input and output channel layout, sample rate and sample format.
+ *
+ * The output parameters are also remembered in the converter because
+ * audio_converter_convert() needs them to size its output buffer.
+ *
+ * If this function fails after the context was allocated, the context stays
+ * attached to the converter and is released by audio_converter_free().
+ *
+ * @return 0 on success and a negative value on error.
+ */
+int audio_converter_init(struct AudioConverter *c, struct ChannelLayout in_chlayout,
+                         int in_sample_rate, enum AVSampleFormat in_sample_fmt,
+                         struct ChannelLayout out_chlayout, int out_sample_rate,
+                         enum AVSampleFormat out_sample_fmt) {
+  int ret;
+
+  c->swr_ctx = swr_alloc();
+  if (c->swr_ctx == NULL) {
+    // Without a context every following call would operate on NULL.
+    return AVERROR(ENOMEM);
+  }
+
+  c->in_sample_rate = in_sample_rate;
+  c->out_sample_rate = out_sample_rate;
+  c->out_chlayout = out_chlayout;
+  c->out_sample_fmt = out_sample_fmt;
+
+#if LIBAVUTIL_VERSION_MAJOR >= 58
+  ret = av_opt_set_chlayout(c->swr_ctx, "in_chlayout", &in_chlayout.layout, 0);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = av_opt_set_chlayout(c->swr_ctx, "out_chlayout", &out_chlayout.layout, 0);
+  if (ret < 0) {
+    return ret;
+  }
+#else
+  ret = av_opt_set_channel_layout(c->swr_ctx, "in_channel_layout", in_chlayout.layout, 0);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = av_opt_set_channel_layout(c->swr_ctx, "out_channel_layout", out_chlayout.layout, 0);
+  if (ret < 0) {
+    return ret;
+  }
+#endif
+
+  ret = av_opt_set_int(c->swr_ctx, "in_sample_rate", in_sample_rate, 0);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = av_opt_set_int(c->swr_ctx, "out_sample_rate", out_sample_rate, 0);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = av_opt_set_sample_fmt(c->swr_ctx, "in_sample_fmt", in_sample_fmt, 0);
+  if (ret < 0) {
+    return ret;
+  }
+
+  ret = av_opt_set_sample_fmt(c->swr_ctx, "out_sample_fmt", out_sample_fmt, 0);
+  if (ret < 0) {
+    return ret;
+  }
+
+  return swr_init(c->swr_ctx);
+}
+
+/**
+ * Converts one decoded frame into a freshly allocated output buffer.
+ *
+ * On success *out_data points at the buffers allocated by
+ * av_samples_alloc_array_and_samples(), *out_samples holds the number of
+ * samples per channel returned by swr_convert() and *out_size is
+ * *out_samples * bytes per output sample * number of output channels.
+ * On failure nothing allocated here is left behind.
+ *
+ * @return 0 on success and a negative value on error.
+ */
+int audio_converter_convert(struct AudioConverter *c, AVFrame *src_frame, uint8_t ***out_data,
+                            int *out_samples, int *out_size) {
+
+#if LIBAVUTIL_VERSION_MAJOR >= 58
+  int out_nb_channels = c->out_chlayout.layout.nb_channels;
+#else
+  int out_nb_channels = av_get_channel_layout_nb_channels(c->out_chlayout.layout);
+#endif
+
+  uint8_t **out_data_tmp = NULL;
+  int max_out_nb_samples = swr_get_out_samples(c->swr_ctx, src_frame->nb_samples);
+  int out_bytes_per_sample = av_get_bytes_per_sample(c->out_sample_fmt);
+
+  // swr_get_out_samples() reports errors as negative values; such a value
+  // must not be used as a buffer size.
+  if (max_out_nb_samples < 0) {
+    XAV_LOG_DEBUG("Couldn't get number of out samples: %d", max_out_nb_samples);
+    return max_out_nb_samples;
+  }
+
+  // Some parts of ffmpeg require buffers to be divisible by 32
+  // to use fast/aligned SIMD routines - this is what the align option is used for.
+  // See https://stackoverflow.com/questions/35678041/what-is-linesize-alignment-meaning
+  // Because we return the binary straight to Erlang, we can disable it.
+  int ret = av_samples_alloc_array_and_samples(&out_data_tmp, NULL, out_nb_channels,
+                                               max_out_nb_samples, c->out_sample_fmt, 1);
+
+  if (ret < 0) {
+    XAV_LOG_DEBUG("Couldn't allocate array for out samples.");
+    return ret;
+  }
+
+  *out_samples = swr_convert(c->swr_ctx, out_data_tmp, max_out_nb_samples,
+                             (const uint8_t **)src_frame->data, src_frame->nb_samples);
+
+  if (*out_samples < 0) {
+    XAV_LOG_DEBUG("Couldn't convert samples: %d", *out_samples);
+    // Two allocations were made above: the sample buffer and the array of
+    // data pointers. Both have to be released, in this order.
+    av_freep(&out_data_tmp[0]);
+    av_freep(&out_data_tmp);
+    return -1;
+  }
+
+  XAV_LOG_DEBUG("Converted %d samples per channel", *out_samples);
+
+  *out_size = *out_samples * out_bytes_per_sample * out_nb_channels;
+
+  *out_data = out_data_tmp;
+
+  return 0;
+}
+
+/**
+ * Releases a converter together with its resampler context and resets the
+ * caller's pointer to NULL. Does nothing when *converter is already NULL.
+ */
+void audio_converter_free(struct AudioConverter **converter) {
+  struct AudioConverter *target = *converter;
+
+  if (target == NULL) {
+    return;
+  }
+
+  // The context is only present once audio_converter_init() has run.
+  if (target->swr_ctx != NULL) {
+    swr_free(&target->swr_ctx);
+  }
+
+  XAV_FREE(target);
+  *converter = NULL;
+}
diff --git a/c_src/xav/audio_converter.h b/c_src/xav/audio_converter.h
@@ -0,0 +1,41 @@
+#ifndef CONVERTER_H
+#define CONVERTER_H
+#include <libavutil/channel_layout.h>
+#include <libswresample/swresample.h>
+#include <stdint.h>
+
+#include "channel_layout.h"
+
+struct AudioConverter {  // state shared by the audio_converter_* functions
+  SwrContext *swr_ctx;  // resampler context; NULL after audio_converter_alloc(), set by audio_converter_init()
+  int64_t in_sample_rate;  // input sample rate given to audio_converter_init()
+  int64_t out_sample_rate;  // output sample rate given to audio_converter_init()
+  struct ChannelLayout out_chlayout;  // output layout; audio_converter_convert() derives the channel count from it
+  enum AVSampleFormat out_sample_fmt;  // output sample format; used to size the output buffer
+};
+
+struct AudioConverter *audio_converter_alloc(void);
+
+int audio_converter_init(struct AudioConverter *c, struct ChannelLayout in_chlayout,
+                         int in_sample_rate, enum AVSampleFormat in_sample_fmt,
+                         struct ChannelLayout out_chlayout, int out_sample_rate,
+                         enum AVSampleFormat out_sample_fmt);
+
+/**
+ * Converts AVFrame to the output format.
+ *
+ * @param c audio converter
+ * @param src_frame decoded source frame
+ * @param out_data buffer where audio samples are written after conversion.
+ * We always convert to the packed format, so only *out_data[0] is set.
+ * It will be initialized internally. The samples have to be freed with
+ * av_freep(&(*out_data[0])) and, afterwards, the array of data pointers
+ * with av_freep(out_data).
+ * @param out_samples number of samples per channel in out_data buffer.
+ * @param out_size size of the out_data buffer in bytes.
+ * This is the same as *out_samples * bytes_per_sample(out_format) * out_channels
+ * @return  0 on success and negative value on error.
+ */
+int audio_converter_convert(struct AudioConverter *c, AVFrame *src_frame, uint8_t ***out_data,
+                            int *out_samples, int *out_size);
+
+void audio_converter_free(struct AudioConverter **converter);
+#endif
diff --git a/c_src/xav/channel_layout.h b/c_src/xav/channel_layout.h
@@ -0,0 +1,12 @@
+#ifndef CHANNEL_LAYOUT_H
+#define CHANNEL_LAYOUT_H
+#include <libavutil/channel_layout.h>
+
+struct ChannelLayout {  // wraps whichever channel layout type the installed libavutil provides
+#if LIBAVUTIL_VERSION_MAJOR >= 58
+  AVChannelLayout layout;  // handed to av_opt_set_chlayout() by the audio converter
+#else
+  uint64_t layout;  // handed to av_opt_set_channel_layout() by the audio converter
+#endif
+};
+#endif
diff --git a/c_src/xav/decoder.c b/c_src/xav/decoder.c
@@ -1,18 +1,25 @@
 #include "decoder.h"
 #include "utils.h"
+#include "video_converter.h"
 
-int decoder_init(struct Decoder *decoder, const char *codec) {
-  decoder->swr_ctx = NULL;
+static int init_converter(struct Decoder *decoder);
+
+/**
+ * Allocates a Decoder with every pointer it owns cleared.
+ *
+ * @return the new decoder, or NULL when the allocation fails.
+ */
+struct Decoder *decoder_alloc() {
+  struct Decoder *decoder = (struct Decoder *)XAV_ALLOC(sizeof(struct Decoder));
+
+  if (decoder == NULL) {
+    return NULL;
+  }
+
+  decoder->codec = NULL;
+  decoder->c = NULL;
+  // decoder_free() compares frame and pkt against NULL, and decoder_init()
+  // can fail before assigning them, so they must start out cleared as well.
+  decoder->frame = NULL;
+  decoder->pkt = NULL;
+
+  return decoder;
+}
+
+int decoder_init(struct Decoder *decoder, const char *codec) {
   if (strcmp(codec, "opus") == 0) {
     decoder->media_type = AVMEDIA_TYPE_AUDIO;
     decoder->codec = avcodec_find_decoder(AV_CODEC_ID_OPUS);
-    // we will initialize out_format_name with the first frame
-    decoder->out_format_name = NULL;
   } else if (strcmp(codec, "vp8") == 0) {
     decoder->media_type = AVMEDIA_TYPE_VIDEO;
     decoder->codec = avcodec_find_decoder(AV_CODEC_ID_VP8);
-    decoder->out_format_name = "rgb";
   } else {
     return -1;
   }
@@ -26,6 +33,16 @@ int decoder_init(struct Decoder *decoder, const char *codec) {
     return -1;
   }
 
+  decoder->frame = av_frame_alloc();
+  if (!decoder->frame) {
+    return -1;
+  }
+
+  decoder->pkt = av_packet_alloc();
+  if (!decoder->pkt) {
+    return -1;
+  }
+
   if (avcodec_open2(decoder->c, decoder->codec, NULL) < 0) {
     return -1;
   }
@@ -40,53 +57,33 @@ int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame) {
     return -2;
   }
 
-  ret = avcodec_receive_frame(decoder->c, frame);
-  if (ret != 0) {
-    return -1;
-  }
+  return avcodec_receive_frame(decoder->c, frame);
+}
 
-  if (decoder->media_type == AVMEDIA_TYPE_AUDIO && decoder->out_format_name == NULL) {
-    enum AVSampleFormat out_sample_fmt = av_get_alt_sample_fmt(frame->format, 0);
-    decoder->out_format_name = av_get_sample_fmt_name(out_sample_fmt);
-  }
+void decoder_free_frame(struct Decoder *decoder) {  // unrefs the decoder's own frame and packet
+  // TODO revisit this
+  av_frame_unref(decoder->frame);  // the AVFrame object itself is freed later by decoder_free()
+  av_packet_unref(decoder->pkt);  // likewise, the AVPacket object is freed by decoder_free()
+}
 
-  if (decoder->media_type == AVMEDIA_TYPE_VIDEO) {
-    if (frame->format != AV_PIX_FMT_RGB24) {
-      convert_to_rgb(frame, decoder->rgb_dst_data, decoder->rgb_dst_linesize);
-      decoder->frame_data = decoder->rgb_dst_data;
-      decoder->frame_linesize = decoder->rgb_dst_linesize;
-    } else {
-      decoder->frame_data = frame->data;
-      decoder->frame_linesize = frame->linesize;
+void decoder_free(struct Decoder **decoder) {
+  XAV_LOG_DEBUG("Freeing Decoder object");
+  if (*decoder != NULL) {
+    struct Decoder *d = *decoder;
+
+    if (d->c != NULL) {
+      avcodec_free_context(&d->c);
     }
-  } else if (decoder->media_type == AVMEDIA_TYPE_AUDIO &&
-             av_sample_fmt_is_planar(frame->format) == 1) {
-    if (decoder->swr_ctx == NULL) {
-      if (init_swr_ctx_from_frame(&decoder->swr_ctx, frame) != 0) {
-        return -1;
-      }
+
+    if (d->pkt != NULL) {
+      av_packet_free(&d->pkt);
     }
 
-    if (convert_to_interleaved(decoder->swr_ctx, frame, decoder->rgb_dst_data,
-                               decoder->rgb_dst_linesize) != 0) {
-      return -1;
+    if (d->frame != NULL) {
+      av_frame_free(&d->frame);
     }
 
-    decoder->frame_data = decoder->rgb_dst_data;
-    decoder->frame_linesize = decoder->rgb_dst_linesize;
-  } else {
-    decoder->frame_data = frame->extended_data;
+    XAV_FREE(d);
+    *decoder = NULL;
   }
-
-  return 0;
 }
-
-void decoder_free(struct Decoder *decoder) {
-  if (decoder->swr_ctx != NULL) {
-    swr_free(&decoder->swr_ctx);
-  }
-
-  if (decoder->c != NULL) {
-    avcodec_free_context(&decoder->c);
-  }
-}
diff --git a/c_src/xav/decoder.h b/c_src/xav/decoder.h
@@ -1,23 +1,23 @@
 #include <libavcodec/avcodec.h>
 #include <libswresample/swresample.h>
 
+#include "audio_converter.h"
+#include "utils.h"
+
 struct Decoder {
   enum AVMediaType media_type;
+  AVFrame *frame;
+  AVPacket *pkt;
   const AVCodec *codec;
   AVCodecContext *c;
-  SwrContext *swr_ctx;
-
-  const char *out_format_name;
-
-  uint8_t *rgb_dst_data[4];
-  int rgb_dst_linesize[4];
-
-  uint8_t **frame_data;
-  int *frame_linesize;
 };
 
+struct Decoder *decoder_alloc();
+
 int decoder_init(struct Decoder *decoder, const char *codec);
 
 int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame);
 
-void decoder_free(struct Decoder *decoder);
+void decoder_free_frame(struct Decoder *decoder);
+
+void decoder_free(struct Decoder **decoder);