Refactor output format resolving.

This commit moves output format resolving into xav_decoder.c. This way we can remove out_format from the Decoder struct.
elixir-webrtc · Jan 3, 2025 · 49848fd · 49848fd
1 parent 13fc3b0
commit 49848fd
Show file tree

Hide file tree

Showing 12 changed files with 91 additions and 87 deletions.
diff --git a/README.md b/README.md
@@ -29,7 +29,7 @@ end
 Decode
 
 ```elixir
-decoder = Xav.Decoder.new(:vp8)
+decoder = Xav.Decoder.new(:vp8, out_format: :rgb24)
 {:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
 ```
 
@@ -52,7 +52,7 @@ Kino.Image.new(tensor)
 Read from a camera:
 
 ```elixir
-r = Xav.Reader.new!("/dev/video0", device?: true)
+r = Xav.Reader.new!("/dev/video0", device?: true, out_format: :rgb24)
 {:ok, %Xav.Frame{} = frame} = Xav.Reader.next_frame(r)
 tensor = Xav.Frame.to_nx(frame)
 Kino.Image.new(tensor)

diff --git a/c_src/xav/decoder.c b/c_src/xav/decoder.c
@@ -9,39 +9,18 @@ struct Decoder *decoder_alloc() {
 
   decoder->codec = NULL;
   decoder->c = NULL;
-  decoder->out_format = AV_PIX_FMT_NONE;
 
   return decoder;
 }
 
-int decoder_init(struct Decoder *decoder, const char *codec, const char* out_format) {
-  if (strcmp(codec, "opus") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_AUDIO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_OPUS);
-  } else if (strcmp(codec, "vp8") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_VIDEO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_VP8);
-  } else if (strcmp(codec, "h264") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_VIDEO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_H264);
-  } else if (strcmp(codec, "h265") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_VIDEO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_HEVC);
-  } else {
-    return -1;
-  }
+int decoder_init(struct Decoder *decoder, enum AVMediaType media_type, enum AVCodecID codec_id) {
+  decoder->media_type = media_type;
+  decoder->codec = avcodec_find_decoder(codec_id);
 
   if (!decoder->codec) {
     return -1;
   }
 
-  if(decoder->media_type == AVMEDIA_TYPE_VIDEO && strcmp(out_format, "nil") != 0) {
-    decoder->out_format = av_get_pix_fmt(out_format);
-    if (decoder->out_format == AV_PIX_FMT_NONE) {
-      return -1;
-    }
-  }
-
   decoder->c = avcodec_alloc_context3(decoder->codec);
   if (!decoder->c) {
     return -1;
@@ -74,7 +53,7 @@ int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame) {
   return avcodec_receive_frame(decoder->c, frame);
 }
 
-int decoder_flush(struct Decoder *decoder, AVFrame **frames, int *frames_count) { 
+int decoder_flush(struct Decoder *decoder, AVFrame **frames, int *frames_count) {
   int ret = avcodec_send_packet(decoder->c, NULL);
   if (ret != 0) {
     return ret;

diff --git a/c_src/xav/decoder.h b/c_src/xav/decoder.h
@@ -8,7 +8,6 @@
 
 struct Decoder {
   enum AVMediaType media_type;
-  enum AVPixelFormat out_format;
   AVFrame *frame;
   AVPacket *pkt;
   const AVCodec *codec;
@@ -17,7 +16,7 @@ struct Decoder {
 
 struct Decoder *decoder_alloc();
 
-int decoder_init(struct Decoder *decoder, const char *codec, const char* out_format);
+int decoder_init(struct Decoder *decoder, enum AVMediaType media_type, enum AVCodecID codec_id);
 
 int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame);
 

diff --git a/c_src/xav/utils.c b/c_src/xav/utils.c
@@ -1,6 +1,6 @@
 #include "utils.h"
-#include <libavutil/mathematics.h>
 #include <libavutil/imgutils.h>
+#include <libavutil/mathematics.h>
 #include <libavutil/opt.h>
 #include <stdint.h>
 
@@ -21,14 +21,14 @@ ERL_NIF_TERM xav_nif_raise(ErlNifEnv *env, char *msg) {
 }
 
 ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int out_samples,
-                                         int out_size, const char *out_format, int pts) {
+                                         int out_size, enum AVSampleFormat out_format, int pts) {
   ERL_NIF_TERM data_term;
 
   unsigned char *ptr = enif_make_new_binary(env, out_size, &data_term);
   memcpy(ptr, out_data[0], out_size);
 
   ERL_NIF_TERM samples_term = enif_make_int(env, out_samples);
-  ERL_NIF_TERM format_term = enif_make_atom(env, out_format);
+  ERL_NIF_TERM format_term = enif_make_atom(env, av_get_sample_fmt_name(out_format));
   ERL_NIF_TERM pts_term = enif_make_int(env, pts);
 
   return enif_make_tuple(env, 4, data_term, format_term, samples_term, pts_term);
@@ -39,9 +39,10 @@ ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame) {
 
   int payload_size = av_image_get_buffer_size(frame->format, frame->width, frame->height, 1);
   unsigned char *ptr = enif_make_new_binary(env, payload_size, &data_term);
-
-  av_image_copy_to_buffer(ptr, payload_size, (const uint8_t *const *)frame->data, 
-                          (const int*)frame->linesize,  frame->format, frame->width, frame->height, 1);
+
+  av_image_copy_to_buffer(ptr, payload_size, (const uint8_t *const *)frame->data,
+                          (const int *)frame->linesize, frame->format, frame->width, frame->height,
+                          1);
 
   ERL_NIF_TERM format_term = enif_make_atom(env, av_get_pix_fmt_name(frame->format));
   ERL_NIF_TERM height_term = enif_make_int(env, frame->height);

diff --git a/c_src/xav/utils.h b/c_src/xav/utils.h
@@ -20,4 +20,4 @@ ERL_NIF_TERM xav_nif_error(ErlNifEnv *env, char *reason);
 ERL_NIF_TERM xav_nif_raise(ErlNifEnv *env, char *msg);
 ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame);
 ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int out_samples,
-                                         int out_size, const char *out_format, int pts);
+                                         int out_size, enum AVSampleFormat out_format, int pts);
diff --git a/c_src/xav/video_converter.c b/c_src/xav/video_converter.c
@@ -1,6 +1,7 @@
 #include "video_converter.h"
 
-int video_converter_convert(AVFrame *src_frame, AVFrame **dst_frame, enum AVPixelFormat out_format) {
+int video_converter_convert(AVFrame *src_frame, AVFrame **dst_frame,
+                            enum AVPixelFormat out_format) {
   int ret;
 
   *dst_frame = av_frame_alloc();
@@ -26,7 +27,6 @@ int video_converter_convert(AVFrame *src_frame, AVFrame **dst_frame, enum AVPixe
     return ret;
   }
 
-
   // is this (const uint8_t * const*) cast really correct?
   ret = sws_scale(sws_ctx, (const uint8_t *const *)src_frame->data, src_frame->linesize, 0,
                   src_frame->height, (*dst_frame)->data, (*dst_frame)->linesize);

diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c
@@ -17,28 +17,61 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
+  // resolve codec
   unsigned int codec_len;
   if (!enif_get_atom_length(env, argv[0], &codec_len, ERL_NIF_LATIN1)) {
     return xav_nif_raise(env, "failed_to_get_atom_length");
   }
 
   char *codec = (char *)XAV_ALLOC((codec_len + 1) * sizeof(char *));
-
   if (enif_get_atom(env, argv[0], codec, codec_len + 1, ERL_NIF_LATIN1) == 0) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
+  enum AVMediaType media_type;
+  enum AVCodecID codec_id;
+  if (strcmp(codec, "opus") == 0) {
+    media_type = AVMEDIA_TYPE_AUDIO;
+    codec_id = AV_CODEC_ID_OPUS;
+  } else if (strcmp(codec, "vp8") == 0) {
+    media_type = AVMEDIA_TYPE_VIDEO;
+    codec_id = AV_CODEC_ID_VP8;
+  } else if (strcmp(codec, "h264") == 0) {
+    media_type = AVMEDIA_TYPE_VIDEO;
+    codec_id = AV_CODEC_ID_H264;
+  } else if (strcmp(codec, "h265") == 0) {
+    media_type = AVMEDIA_TYPE_VIDEO;
+    codec_id = AV_CODEC_ID_HEVC;
+  } else {
+    return xav_nif_raise(env, "failed_to_resolve_codec");
+  }
+
+  // resolve output format
   unsigned int out_format_len;
   if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
     return xav_nif_raise(env, "failed_to_get_atom_length");
   }
 
   char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
-
   if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
+  enum AVPixelFormat out_video_fmt = AV_PIX_FMT_NONE;
+  enum AVSampleFormat out_audo_fmt = AV_SAMPLE_FMT_NONE;
+  if (media_type == AVMEDIA_TYPE_VIDEO && strcmp(out_format, "nil") != 0) {
+    out_video_fmt = av_get_pix_fmt(out_format);
+    if (out_video_fmt == AV_PIX_FMT_NONE) {
+      return xav_nif_raise(env, "unknown_out_format");
+    }
+  } else if (media_type == AVMEDIA_TYPE_AUDIO && strcmp(out_format, "nil") != 0) {
+    out_audo_fmt = av_get_sample_fmt(out_format);
+    if (out_audo_fmt == AV_SAMPLE_FMT_NONE) {
+      return xav_nif_raise(env, "unknown_out_format");
+    }
+  }
+
+  // resolve other params
   int out_sample_rate;
   if (!enif_get_int(env, argv[2], &out_sample_rate)) {
     return xav_nif_raise(env, "invalid_out_sample_rate");
@@ -53,7 +86,8 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
       enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder));
   xav_decoder->decoder = NULL;
   xav_decoder->ac = NULL;
-  xav_decoder->out_format = out_format;
+  xav_decoder->out_audio_fmt = out_audo_fmt;
+  xav_decoder->out_video_fmt = out_video_fmt;
   xav_decoder->out_sample_rate = out_sample_rate;
   xav_decoder->out_channels = out_channels;
 
@@ -62,31 +96,31 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     return xav_nif_raise(env, "failed_to_allocate_decoder");
   }
 
-  if (decoder_init(xav_decoder->decoder, codec, xav_decoder->out_format) != 0) {
+  if (decoder_init(xav_decoder->decoder, media_type, codec_id) != 0) {
     return xav_nif_raise(env, "failed_to_init_decoder");
   }
 
   ERL_NIF_TERM decoder_term = enif_make_resource(env, xav_decoder);
   enif_release_resource(xav_decoder);
 
+  XAV_FREE(out_format);
+
   return decoder_term;
 }
 
-ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame* frame) {
+ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame *frame) {
   ERL_NIF_TERM frame_term;
   int ret;
 
   if (xav_decoder->decoder->media_type == AVMEDIA_TYPE_VIDEO) {
     XAV_LOG_DEBUG("Converting video to RGB");
 
-    int out_pix_fmt = xav_decoder->decoder->out_format;
-
-    if (out_pix_fmt == AV_PIX_FMT_NONE) {
+    if (xav_decoder->out_video_fmt == AV_PIX_FMT_NONE) {
       return xav_nif_video_frame_to_term(env, frame);
     }
 
     AVFrame *dst_frame;
-    ret = video_converter_convert(frame, &dst_frame, out_pix_fmt);
+    ret = video_converter_convert(frame, &dst_frame, xav_decoder->out_video_fmt);
     if (ret <= 0) {
       return xav_nif_raise(env, "failed_to_decode");
     }
@@ -104,7 +138,7 @@ ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame* fr
     if (xav_decoder->ac == NULL) {
       ret = init_audio_converter(xav_decoder);
       if (ret < 0) {
-        return xav_nif_raise(env, "failed_to_init_converter");;
+        return xav_nif_raise(env, "failed_to_init_converter");
       }
     }
 
@@ -113,15 +147,8 @@ ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame* fr
       return xav_nif_raise(env, "failed_to_decode");
     }
 
-    const char *out_format = av_get_sample_fmt_name(xav_decoder->ac->out_sample_fmt);
-
-    if (strcmp(out_format, "flt") == 0) {
-      out_format = "f32";
-    } else if (strcmp(out_format, "dbl") == 0) {
-      out_format = "f64";
-    }
-
-    frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format, frame->pts);
+    frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size,
+                                             xav_decoder->out_audio_fmt, frame->pts);
 
     av_freep(&out_data[0]);
   }
@@ -229,23 +256,12 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
     out_sample_rate = xav_decoder->out_sample_rate;
   }
 
-  enum AVSampleFormat out_sample_fmt;
-  if (strcmp(xav_decoder->out_format, "u8") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_U8;
-  } else if (strcmp(xav_decoder->out_format, "s16") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_S16;
-  } else if (strcmp(xav_decoder->out_format, "s32") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_S32;
-  } else if (strcmp(xav_decoder->out_format, "s64") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_S64;
-  } else if (strcmp(xav_decoder->out_format, "f32") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_FLT;
-  } else if (strcmp(xav_decoder->out_format, "f64") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_DBL;
-  } else if (strcmp(xav_decoder->out_format, "nil") == 0) {
-    out_sample_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
-  } else {
-    return -1;
+  // If the user didn't request a specific format,
+  // use the decoder's own sample format, but in its packed form.
+  // This has to be resolved here because in decoder_init
+  // the sample_fmt is not known yet.
+  if (xav_decoder->out_audio_fmt == AV_SAMPLE_FMT_NONE) {
+    xav_decoder->out_audio_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
   }
 
   struct ChannelLayout in_chlayout, out_chlayout;
@@ -267,7 +283,7 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
 
   return audio_converter_init(xav_decoder->ac, in_chlayout, xav_decoder->decoder->c->sample_rate,
                               xav_decoder->decoder->c->sample_fmt, out_chlayout, out_sample_rate,
-                              out_sample_fmt);
+                              xav_decoder->out_audio_fmt);
 }
 
 void free_xav_decoder(ErlNifEnv *env, void *obj) {

diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h
@@ -1,10 +1,13 @@
 #include "audio_converter.h"
 #include "decoder.h"
 
+#include <libavutil/pixfmt.h>
+
 struct XavDecoder {
   struct Decoder *decoder;
   struct AudioConverter *ac;
-  char *out_format;
+  enum AVPixelFormat out_video_fmt;
+  enum AVSampleFormat out_audio_fmt;
   int out_sample_rate;
   int out_channels;
 };
diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c
@@ -169,16 +169,9 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
       return xav_nif_raise(env, "failed_to_read");
     }
 
-    const char *out_format = av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt);
-
-    if (strcmp(out_format, "flt") == 0) {
-      out_format = "f32";
-    } else if (strcmp(out_format, "dbl") == 0) {
-      out_format = "f64";
-    }
-
-    frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
-                                             xav_reader->reader->frame->pts);
+    frame_term =
+        xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size,
+                                    xav_reader->ac->out_sample_fmt, xav_reader->reader->frame->pts);
     av_freep(&out_data[0]);
   }
 

diff --git a/lib/decoder.ex b/lib/decoder.ex
@@ -82,13 +82,15 @@ defmodule Xav.Decoder do
         :ok
 
       {:ok, {data, format, width, height, pts}} ->
+        format = normalize_format(format)
         {:ok, Xav.Frame.new(data, format, width, height, pts)}
 
       # Sometimes, audio converter might not return data immediately.
       {:ok, {"", _format, _samples, _pts}} ->
         :ok
 
       {:ok, {data, format, samples, pts}} ->
+        format = normalize_format(format)
         {:ok, Xav.Frame.new(data, format, samples, pts)}
 
       {:error, _reason} = error ->
@@ -123,4 +125,9 @@ defmodule Xav.Decoder do
       {:error, reason} -> raise "Failed to flush decoder: #{inspect(reason)}"
     end
   end
+
+  # Use the same formats as Nx
+  defp normalize_format(:flt), do: :f32
+  defp normalize_format(:dbl), do: :f64
+  defp normalize_format(other), do: other
 end