Add scaling support to converter and decoder (#27)

elixir-webrtc · Jan 19, 2025 · 56253a4
1 parent b52115d
commit 56253a4
Show file tree

Hide file tree

Showing 15 changed files with 334 additions and 164 deletions.
diff --git a/c_src/xav/video_converter.c b/c_src/xav/video_converter.c
@@ -1,41 +1,65 @@
 #include "video_converter.h"
 #include "utils.h"
 
-static inline unsigned int video_converter_resolution_changed(struct VideoConverter *converter, AVFrame *frame) {
-  return converter->in_format != frame->format || 
-          converter->in_width != frame->width || 
-          converter->in_height != frame->height;
+static inline unsigned int video_converter_resolution_changed(struct VideoConverter *converter,
+                                                              AVFrame *frame) {
+  return converter->in_format != frame->format || converter->in_width != frame->width ||
+         converter->in_height != frame->height;
 }
 
 struct VideoConverter *video_converter_alloc() {
   struct VideoConverter *converter =
       (struct VideoConverter *)XAV_ALLOC(sizeof(struct VideoConverter));
-  if(converter) {
+  if (converter) {
     converter->sws_ctx = NULL;
     converter->dst_frame = av_frame_alloc();
   }
   return converter;
 }
 
-int video_converter_init(struct VideoConverter *converter, int in_width, int in_height, 
-                          enum AVPixelFormat in_format, enum AVPixelFormat out_format) {                            
+int video_converter_init(struct VideoConverter *converter, int in_width, int in_height,
+                         enum AVPixelFormat in_format, int out_width, int out_height,
+                         enum AVPixelFormat out_format) {
   converter->in_width = in_width;
   converter->in_height = in_height;
   converter->in_format = in_format;
-  converter->out_format = out_format;
 
-  av_frame_unref(converter->dst_frame);
+  converter->out_width = out_width;
+  converter->out_height = out_height;
+  converter->out_format = out_format;
 
-  converter->dst_frame->width = in_width;
-  converter->dst_frame->height = in_height;
-  converter->dst_frame->format = out_format;
+  AVFrame *dst_frame = converter->dst_frame;
+  av_frame_unref(dst_frame);
+
+  dst_frame->format = out_format;
+
+  if (out_width == -1 && out_height == -1) {
+    dst_frame->width = in_width;
+    dst_frame->height = in_height;
+  } else if (out_width == -1) {
+    int width = in_width * out_height / in_height;
+    width = width + (width % 2);
+
+    dst_frame->width = width;
+    dst_frame->height = out_height;
+  } else if (out_height == -1) {
+    int height = in_height * out_width / in_width;
+    height = height + (height % 2);
+
+    dst_frame->width = out_width;
+    dst_frame->height = height;
+  } else {
+    dst_frame->width = out_width;
+    dst_frame->height = out_height;
+  }
 
-  int ret = av_frame_get_buffer(converter->dst_frame, 0);
+  int ret = av_frame_get_buffer(dst_frame, 0);
   if (ret < 0)
     return ret;
 
-  converter->sws_ctx = sws_getContext(in_width, in_height, in_format, in_width, in_height, out_format, 
-                                  SWS_BILINEAR, NULL, NULL, NULL);
+  converter->sws_ctx =
+      sws_getContext(in_width, in_height, in_format, dst_frame->width, dst_frame->height,
+                     dst_frame->format, SWS_BILINEAR, NULL, NULL, NULL);
 
   if (!converter->sws_ctx) {
     XAV_LOG_DEBUG("Couldn't get sws context");
@@ -51,8 +75,8 @@ int video_converter_convert(struct VideoConverter *converter, AVFrame *src_frame
   if (video_converter_resolution_changed(converter, src_frame)) {
     XAV_LOG_DEBUG("Frame resolution changed");
     sws_freeContext(converter->sws_ctx);
-    ret = video_converter_init(converter, src_frame->width, src_frame->height, 
-                                src_frame->format, converter->out_format);
+    ret = video_converter_init(converter, src_frame->width, src_frame->height, src_frame->format,
+                               converter->out_width, converter->out_height, converter->out_format);
     if (ret < 0) {
       return ret;
     }
@@ -61,12 +85,13 @@ int video_converter_convert(struct VideoConverter *converter, AVFrame *src_frame
   converter->dst_frame->pts = src_frame->pts;
 
   // is this (const uint8_t * const*) cast really correct?
-  return sws_scale(converter->sws_ctx, (const uint8_t *const *)src_frame->data, src_frame->linesize, 0,
-                  src_frame->height, converter->dst_frame->data, converter->dst_frame->linesize);
+  return sws_scale(converter->sws_ctx, (const uint8_t *const *)src_frame->data, src_frame->linesize,
+                   0, src_frame->height, converter->dst_frame->data,
+                   converter->dst_frame->linesize);
 }
 
 void video_converter_free(struct VideoConverter **converter) {
-  struct VideoConverter* vc = *converter;
+  struct VideoConverter *vc = *converter;
   if (vc != NULL) {
     if (vc->sws_ctx != NULL) {
       sws_freeContext((*converter)->sws_ctx);

diff --git a/c_src/xav/video_converter.h b/c_src/xav/video_converter.h
@@ -6,18 +6,21 @@
 #include <stdint.h>
 
 struct VideoConverter {
-    struct SwsContext *sws_ctx;
-    int in_width;
-    int in_height;
-    enum AVPixelFormat in_format;
-    enum AVPixelFormat out_format;
-    AVFrame *dst_frame;
+  struct SwsContext *sws_ctx;
+  int in_width;
+  int in_height;
+  enum AVPixelFormat in_format;
+  int out_width;
+  int out_height;
+  enum AVPixelFormat out_format;
+  AVFrame *dst_frame;
 };
 
 struct VideoConverter *video_converter_alloc();
 
-int video_converter_init(struct VideoConverter* converter, int in_width, int in_height, 
-                         enum AVPixelFormat in_format, enum AVPixelFormat out_format);
+int video_converter_init(struct VideoConverter *converter, int in_width, int in_height,
+                         enum AVPixelFormat in_format, int out_width, int out_height,
+                         enum AVPixelFormat out_format);
 
 int video_converter_convert(struct VideoConverter *converter, AVFrame *src_frame);
 

diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c
@@ -12,19 +12,17 @@ void free_frames(AVFrame **frames, int size) {
   }
 }
 
-ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 4) {
+ERL_NIF_TERM new (ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+  if (argc != 6) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
-  // resolve codec
-  unsigned int codec_len;
-  if (!enif_get_atom_length(env, argv[0], &codec_len, ERL_NIF_LATIN1)) {
-    return xav_nif_raise(env, "failed_to_get_atom_length");
-  }
+  ERL_NIF_TERM ret;
+  char *codec = NULL;
+  char *out_format = NULL;
 
-  char *codec = (char *)XAV_ALLOC((codec_len + 1) * sizeof(char *));
-  if (enif_get_atom(env, argv[0], codec, codec_len + 1, ERL_NIF_LATIN1) == 0) {
+  // resolve codec
+  if (!xav_get_atom(env, argv[0], &codec)) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
@@ -39,74 +37,94 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   } else if (strcmp(codec, "h264") == 0) {
     media_type = AVMEDIA_TYPE_VIDEO;
     codec_id = AV_CODEC_ID_H264;
-  } else if (strcmp(codec, "h265") == 0) {
+  } else if (strcmp(codec, "h265") == 0 || strcmp(codec, "hevc") == 0) {
     media_type = AVMEDIA_TYPE_VIDEO;
     codec_id = AV_CODEC_ID_HEVC;
   } else {
-    return xav_nif_raise(env, "failed_to_resolve_codec");
+    ret = xav_nif_raise(env, "failed_to_resolve_codec");
+    goto clean;
   }
 
   // resolve output format
-  unsigned int out_format_len;
-  if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
-    return xav_nif_raise(env, "failed_to_get_atom_length");
-  }
-
-  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
-  if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
-    return xav_nif_raise(env, "failed_to_get_atom");
+  if (!xav_get_atom(env, argv[1], &out_format)) {
+    ret = xav_nif_raise(env, "failed_to_get_atom");
+    goto clean;
   }
 
   enum AVPixelFormat out_video_fmt = AV_PIX_FMT_NONE;
   enum AVSampleFormat out_audo_fmt = AV_SAMPLE_FMT_NONE;
   if (media_type == AVMEDIA_TYPE_VIDEO && strcmp(out_format, "nil") != 0) {
     out_video_fmt = av_get_pix_fmt(out_format);
     if (out_video_fmt == AV_PIX_FMT_NONE) {
-      return xav_nif_raise(env, "unknown_out_format");
+      ret = xav_nif_raise(env, "unknown_out_format");
+      goto clean;
     }
   } else if (media_type == AVMEDIA_TYPE_AUDIO && strcmp(out_format, "nil") != 0) {
     out_audo_fmt = av_get_sample_fmt(out_format);
     if (out_audo_fmt == AV_SAMPLE_FMT_NONE) {
-      return xav_nif_raise(env, "unknown_out_format");
+      ret = xav_nif_raise(env, "unknown_out_format");
+      goto clean;
     }
   }
 
   // resolve other params
   int out_sample_rate;
   if (!enif_get_int(env, argv[2], &out_sample_rate)) {
-    return xav_nif_raise(env, "invalid_out_sample_rate");
+    ret = xav_nif_raise(env, "invalid_out_sample_rate");
+    goto clean;
   }
 
   int out_channels;
   if (!enif_get_int(env, argv[3], &out_channels)) {
-    return xav_nif_raise(env, "invalid_out_channels");
+    ret = xav_nif_raise(env, "invalid_out_channels");
+    goto clean;
+  }
+
+  int out_width;
+  if (!enif_get_int(env, argv[4], &out_width)) {
+    ret = xav_nif_raise(env, "failed_to_get_int");
+    goto clean;
+  }
+
+  int out_height;
+  if (!enif_get_int(env, argv[5], &out_height)) {
+    ret = xav_nif_raise(env, "failed_to_get_int");
+    goto clean;
   }
 
   struct XavDecoder *xav_decoder =
       enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder));
   xav_decoder->decoder = NULL;
   xav_decoder->ac = NULL;
   xav_decoder->vc = NULL;
-  xav_decoder->out_audio_fmt = out_audo_fmt;
   xav_decoder->out_video_fmt = out_video_fmt;
+  xav_decoder->out_width = out_width;
+  xav_decoder->out_height = out_height;
+  xav_decoder->out_audio_fmt = out_audo_fmt;
   xav_decoder->out_sample_rate = out_sample_rate;
   xav_decoder->out_channels = out_channels;
 
   xav_decoder->decoder = decoder_alloc();
   if (xav_decoder->decoder == NULL) {
-    return xav_nif_raise(env, "failed_to_allocate_decoder");
+    ret = xav_nif_raise(env, "failed_to_allocate_decoder");
+    goto clean;
   }
 
   if (decoder_init(xav_decoder->decoder, media_type, codec_id) != 0) {
-    return xav_nif_raise(env, "failed_to_init_decoder");
+    ret = xav_nif_raise(env, "failed_to_init_decoder");
+    goto clean;
   }
 
-  ERL_NIF_TERM decoder_term = enif_make_resource(env, xav_decoder);
+  ret = enif_make_resource(env, xav_decoder);
   enif_release_resource(xav_decoder);
 
-  XAV_FREE(out_format);
+clean:
+  if (codec != NULL)
+    XAV_FREE(codec);
+  if (out_format != NULL)
+    XAV_FREE(out_format);
 
-  return decoder_term;
+  return ret;
 }
 
 ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame *frame) {
@@ -116,7 +134,9 @@ ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame *fr
   if (xav_decoder->decoder->media_type == AVMEDIA_TYPE_VIDEO) {
     XAV_LOG_DEBUG("Converting video to RGB");
 
-    if (xav_decoder->out_video_fmt == AV_PIX_FMT_NONE) {
+    // no pixel format conversion and no scaling
+    if (xav_decoder->out_video_fmt == AV_PIX_FMT_NONE && xav_decoder->out_width == -1 &&
+        xav_decoder->out_height == -1) {
       return xav_nif_video_frame_to_term(env, frame);
     }
 
@@ -299,8 +319,12 @@ static int init_video_converter(struct XavDecoder *xav_decoder, AVFrame *frame)
     return -1;
   }
 
-  return video_converter_init(xav_decoder->vc, frame->width, frame->height, 
-                                  frame->format, xav_decoder->out_video_fmt);
+  enum AVPixelFormat out_format = xav_decoder->out_video_fmt;
+  if (out_format == AV_PIX_FMT_NONE)
+    out_format = frame->format;
+
+  return video_converter_init(xav_decoder->vc, frame->width, frame->height, frame->format,
+                              xav_decoder->out_width, xav_decoder->out_height, out_format);
 }
 
 void free_xav_decoder(ErlNifEnv *env, void *obj) {
@@ -319,7 +343,7 @@ void free_xav_decoder(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 4, new},
+static ErlNifFunc xav_funcs[] = {{"new", 6, new},
                                  {"decode", 4, decode, ERL_NIF_DIRTY_JOB_CPU_BOUND},
                                  {"flush", 1, flush, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 

diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h
@@ -1,14 +1,18 @@
 #include "audio_converter.h"
-#include "video_converter.h"
 #include "decoder.h"
+#include "video_converter.h"
 
 #include <libavutil/pixfmt.h>
 
 struct XavDecoder {
   struct Decoder *decoder;
-  struct AudioConverter *ac;
+  // Video params
   struct VideoConverter *vc;
   enum AVPixelFormat out_video_fmt;
+  int out_width;
+  int out_height;
+  // Audio params
+  struct AudioConverter *ac;
   enum AVSampleFormat out_audio_fmt;
   int out_sample_rate;
   int out_channels;

diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c
@@ -5,7 +5,7 @@ static int init_video_converter(struct XavReader *xav_reader, AVFrame *frame);
 
 ErlNifResourceType *xav_reader_resource_type;
 
-ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+ERL_NIF_TERM new (ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   if (argc != 6) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
@@ -290,8 +290,8 @@ static int init_video_converter(struct XavReader *xav_reader, AVFrame *frame) {
     return -1;
   }
 
-  return video_converter_init(xav_reader->vc, frame->width, frame->height, 
-                                  frame->format, AV_PIX_FMT_RGB24);
+  return video_converter_init(xav_reader->vc, frame->width, frame->height, frame->format,
+                              frame->width, frame->height, AV_PIX_FMT_RGB24);
 }
 
 void free_xav_reader(ErlNifEnv *env, void *obj) {

diff --git a/c_src/xav/xav_reader.h b/c_src/xav/xav_reader.h
@@ -1,6 +1,6 @@
 #include "audio_converter.h"
-#include "video_converter.h"
 #include "reader.h"
+#include "video_converter.h"
 
 struct XavReader {
   struct Reader *reader;