Add ability to specify output parameters

elixir-webrtc · Aug 7, 2024 · 2a4f4bc · 2a4f4bc
1 parent dad836a
commit 2a4f4bc
Show file tree

Hide file tree

Showing 11 changed files with 258 additions and 43 deletions.
diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c
@@ -7,7 +7,7 @@ ErlNifResourceType *xav_decoder_resource_type;
 static int init_audio_converter(struct XavDecoder *xav_decoder);
 
 ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 1) {
+  if (argc != 4) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
@@ -22,10 +22,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
+  unsigned int out_format_len;
+  if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
+    return xav_nif_raise(env, "failed_to_get_atom_length");
+  }
+
+  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
+
+  if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
+    return xav_nif_raise(env, "failed_to_get_atom");
+  }
+
+  int out_sample_rate;
+  if (!enif_get_int(env, argv[2], &out_sample_rate)) {
+    return xav_nif_raise(env, "invalid_out_sample_rate");
+  }
+
+  int out_channels;
+  if (!enif_get_int(env, argv[3], &out_channels)) {
+    return xav_nif_raise(env, "invalid_out_channels");
+  }
+
   struct XavDecoder *xav_decoder =
       enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder));
   xav_decoder->decoder = NULL;
   xav_decoder->ac = NULL;
+  xav_decoder->out_format = out_format;
+  xav_decoder->out_sample_rate = out_sample_rate;
+  xav_decoder->out_channels = out_channels;
 
   xav_decoder->decoder = decoder_alloc();
   if (xav_decoder->decoder == NULL) {
@@ -120,6 +144,12 @@ ERL_NIF_TERM decode(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
 
     const char *out_format = av_get_sample_fmt_name(xav_decoder->ac->out_sample_fmt);
 
+    if (strcmp(out_format, "flt") == 0) {
+      out_format = "f32";
+    } else if (strcmp(out_format, "dbl") == 0) {
+      out_format = "f64";
+    }
+
     frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
                                              xav_decoder->decoder->frame->pts);
 
@@ -139,16 +169,47 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
     return -1;
   }
 
-  int out_sample_rate = xav_decoder->decoder->c->sample_rate;
-  enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  int out_sample_rate;
+  if (xav_decoder->out_sample_rate == 0) {
+    out_sample_rate = xav_decoder->decoder->c->sample_rate;
+  } else {
+    out_sample_rate = xav_decoder->out_sample_rate;
+  }
+
+  enum AVSampleFormat out_sample_fmt;
+  if (strcmp(xav_decoder->out_format, "u8") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_U8;
+  } else if (strcmp(xav_decoder->out_format, "s16") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S16;
+  } else if (strcmp(xav_decoder->out_format, "s32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S32;
+  } else if (strcmp(xav_decoder->out_format, "s64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S64;
+  } else if (strcmp(xav_decoder->out_format, "f32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  } else if (strcmp(xav_decoder->out_format, "f64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_DBL;
+  } else if (strcmp(xav_decoder->out_format, "nil") == 0) {
+    out_sample_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
+  } else {
+    return -1;
+  }
 
   struct ChannelLayout in_chlayout, out_chlayout;
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   in_chlayout.layout = xav_decoder->decoder->c->ch_layout;
-  out_chlayout.layout = xav_decoder->decoder->c->ch_layout;
+  if (xav_decoder->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    av_channel_layout_default(&out_chlayout.layout, xav_decoder->out_channels);
+  }
 #else
   in_chlayout.layout = xav_decoder->decoder->c->channel_layout;
-  out_chlayout.layout = xav_decoder->decoder->c->channel_layout;
+  if (xav_decoder->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    out_chlayout.layout = av_get_default_channel_layout(xav_decoder->out_channels);
+  }
 #endif
 
   return audio_converter_init(xav_decoder->ac, in_chlayout, xav_decoder->decoder->c->sample_rate,
@@ -168,7 +229,7 @@ void free_xav_decoder(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 1, new},
+static ErlNifFunc xav_funcs[] = {{"new", 4, new},
                                  {"decode", 4, decode, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
 static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {

diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h
@@ -4,4 +4,7 @@
 struct XavDecoder {
   struct Decoder *decoder;
   struct AudioConverter *ac;
+  char *out_format; /* requested output sample format as the atom's name: "u8", "s16", "s32", "s64", "f32", "f64", or "nil" to derive it from the codec's sample_fmt via av_get_alt_sample_fmt(..., 0); buffer is XAV_ALLOC'ed in new() -- NOTE(review): confirm free_xav_decoder releases it */
+  int out_sample_rate; /* requested output sample rate; 0 means use the codec context's sample_rate */
+  int out_channels; /* requested output channel count; 0 means reuse the input channel layout */
 };
diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c
@@ -5,7 +5,7 @@ static int init_audio_converter(struct XavReader *xav_reader);
 ErlNifResourceType *xav_reader_resource_type;
 
 ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 3) {
+  if (argc != 6) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
@@ -31,10 +31,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     media_type = AVMEDIA_TYPE_AUDIO;
   }
 
+  unsigned int out_format_len;
+  if (!enif_get_atom_length(env, argv[3], &out_format_len, ERL_NIF_LATIN1)) {
+    return xav_nif_raise(env, "failed_to_get_atom_length");
+  }
+
+  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
+
+  if (enif_get_atom(env, argv[3], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
+    return xav_nif_raise(env, "failed_to_get_atom");
+  }
+
+  int out_sample_rate;
+  if (!enif_get_int(env, argv[4], &out_sample_rate)) {
+    return xav_nif_raise(env, "invalid_out_sample_rate");
+  }
+
+  int out_channels;
+  if (!enif_get_int(env, argv[5], &out_channels)) {
+    return xav_nif_raise(env, "invalid_out_channels");
+  }
+
   struct XavReader *xav_reader =
       enif_alloc_resource(xav_reader_resource_type, sizeof(struct XavReader));
   xav_reader->reader = NULL;
   xav_reader->ac = NULL;
+  xav_reader->out_format = out_format;
+  xav_reader->out_sample_rate = out_sample_rate;
+  xav_reader->out_channels = out_channels;
 
   xav_reader->reader = reader_alloc();
   if (xav_reader->reader == NULL) {
@@ -133,6 +157,12 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
 
     const char *out_format = av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt);
 
+    if (strcmp(out_format, "flt") == 0) {
+      out_format = "f32";
+    } else if (strcmp(out_format, "dbl") == 0) {
+      out_format = "f64";
+    }
+
     frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
                                              xav_reader->reader->frame->pts);
     av_freep(&out_data[0]);
@@ -151,16 +181,42 @@ static int init_audio_converter(struct XavReader *xav_reader) {
     return -1;
   }
 
-  int out_sample_rate = 16000;
-  enum AVSampleFormat out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  int out_sample_rate;
+  if (xav_reader->out_sample_rate == 0) {
+    out_sample_rate = xav_reader->reader->c->sample_rate;
+  } else {
+    out_sample_rate = xav_reader->out_sample_rate;
+  }
+
+  enum AVSampleFormat out_sample_fmt;
+  if (strcmp(xav_reader->out_format, "u8") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_U8;
+  } else if (strcmp(xav_reader->out_format, "s16") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S16;
+  } else if (strcmp(xav_reader->out_format, "s32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S32;
+  } else if (strcmp(xav_reader->out_format, "s64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_S64;
+  } else if (strcmp(xav_reader->out_format, "f32") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_FLT;
+  } else if (strcmp(xav_reader->out_format, "f64") == 0) {
+    out_sample_fmt = AV_SAMPLE_FMT_DBL;
+  } else if (strcmp(xav_reader->out_format, "nil") == 0) {
+    out_sample_fmt = av_get_alt_sample_fmt(xav_reader->reader->c->sample_fmt, 0);
+  } else {
+    return -1;
+  }
 
   struct ChannelLayout in_chlayout, out_chlayout;
 #if LIBAVUTIL_VERSION_MAJOR >= 58
   in_chlayout.layout = xav_reader->reader->c->ch_layout;
-  av_channel_layout_from_mask(&out_chlayout.layout, AV_CH_LAYOUT_MONO);
+  if (xav_reader->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    av_channel_layout_default(&out_chlayout.layout, xav_reader->out_channels);
+  }
 #else
   in_chlayout.layout = xav_reader->reader->c->channel_layout;
-  out_chlayout.layout = AV_CH_LAYOUT_MONO;
 
   if (xav_reader->reader->c->channel_layout == 0 && xav_reader->reader->c->channels > 0) {
     // In newer FFmpeg versions, 0 means that the order of channels is
@@ -176,6 +232,11 @@ static int init_audio_converter(struct XavReader *xav_reader) {
     return -1;
   }
 
+  if (xav_reader->out_channels == 0) {
+    out_chlayout.layout = in_chlayout.layout;
+  } else {
+    out_chlayout.layout = av_get_default_channel_layout(xav_reader->out_channels);
+  }
 #endif
 
   return audio_converter_init(xav_reader->ac, in_chlayout, xav_reader->reader->c->sample_rate,
@@ -195,7 +256,7 @@ void free_xav_reader(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 3, new},
+static ErlNifFunc xav_funcs[] = {{"new", 6, new},
                                  {"next_frame", 1, next_frame, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
 static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {

diff --git a/c_src/xav/xav_reader.h b/c_src/xav/xav_reader.h
@@ -4,4 +4,7 @@
 struct XavReader {
   struct Reader *reader;
   struct AudioConverter *ac;
+  char *out_format; /* requested output sample format as the atom's name: "u8", "s16", "s32", "s64", "f32", "f64", or "nil" to derive it from the codec's sample_fmt via av_get_alt_sample_fmt(..., 0); buffer is XAV_ALLOC'ed in new() -- NOTE(review): confirm free_xav_reader releases it */
+  int out_sample_rate; /* requested output sample rate; 0 means use the codec context's sample_rate */
+  int out_channels; /* requested output channel count; 0 means reuse the input channel layout */
 };
diff --git a/lib/decoder.ex b/lib/decoder.ex
@@ -10,19 +10,59 @@ defmodule Xav.Decoder do
 
   @type t() :: reference()
 
+  @typedoc """
+  Opts that can be passed to `new/2`.
+  """
+  @type opts :: [
+          out_format: Xav.Frame.format(),
+          out_sample_rate: integer(),
+          out_channels: integer()
+        ]
+
   @doc """
   Creates a new decoder.
+
+  `opts` can be used to specify desired output parameters.
+
+  E.g. if you want to change audio samples format just pass:
+
+  ```elixir
+  [out_format: :f32]
+  ```
+
+  Video frames are always returned in RGB format.
+  This setting cannot be changed.
+
+  Audio samples are always in the packed form -
+  samples from different channels are interleaved in the same, single binary:
+
+  ```
+  <<c10, c20, c30, c11, c21, c31, c12, c22, c32>>
+  ```
+
+  The way in which samples are interleaved is not specified.
+
+  An alternative would be to return a list of binaries, where
+  each binary represents different channel:
+
+  ```
+  [
+    <<c10, c11, c12, c13, c14>>,
+    <<c20, c21, c22, c23, c24>>,
+    <<c30, c31, c32, c33, c34>>
+  ]
+  ```
   """
-  @spec new(codec()) :: t()
-  def new(codec) do
-    Xav.Decoder.NIF.new(codec)
+  @spec new(codec(), opts()) :: t()
+  def new(codec, opts \\ []) do
+    out_format = opts[:out_format] # nil when omitted; the NIF reads the atom's name and treats "nil" as "derive the format from the codec"
+    out_sample_rate = opts[:out_sample_rate] || 0 # 0 is the NIF's sentinel for "use the codec's sample rate"
+    out_channels = opts[:out_channels] || 0 # 0 is the NIF's sentinel for "reuse the input channel layout"
+    Xav.Decoder.NIF.new(codec, out_format, out_sample_rate, out_channels)
+  end
 
   @doc """
-  Decodes an audio or video frame.
-
-  Video frames are always in the RGB format.
-  Audio samples are always interleaved.
+  Decodes an audio/video frame.
   """
   @spec decode(t(), binary(), pts: integer(), dts: integer()) ::
           {:ok, Xav.Frame.t()} | {:error, atom()}

diff --git a/lib/decoder_nif.ex b/lib/decoder_nif.ex
@@ -8,7 +8,7 @@ defmodule Xav.Decoder.NIF do
     :ok = :erlang.load_nif(path, 0)
   end
 
-  def new(_codec), do: :erlang.nif_error(:undef)
+  def new(_codec, _out_format, _out_sample_rate, _out_channels), do: :erlang.nif_error(:undef) # NIF stub; arity must match the {"new", 4, new} entry in xav_funcs (c_src/xav/xav_decoder.c)
 
   def decode(_decoder, _data, _pts, _dts), do: :erlang.nif_error(:undef)
 end