From 56253a4fb07ffb87d4bd94743bfc31cee5eaf4aa Mon Sep 17 00:00:00 2001
From: Billal Ghilas <84322223+gBillal@users.noreply.github.com>
Date: Sun, 19 Jan 2025 20:09:09 +0100
Subject: [PATCH] Add scaling support to converter and decoder (#27)

---
 c_src/xav/video_converter.c     | 65 ++++++++++++++++-------
 c_src/xav/video_converter.h     | 19 ++++---
 c_src/xav/xav_decoder.c         | 90 +++++++++++++++++++------------
 c_src/xav/xav_decoder.h         |  8 ++-
 c_src/xav/xav_reader.c          |  6 +--
 c_src/xav/xav_reader.h          |  2 +-
 c_src/xav/xav_video_converter.c | 93 +++++++++++++++++++++------------
 c_src/xav/xav_video_converter.h | 10 ++--
 lib/decoder.ex                  |  8 ++-
 lib/decoder_nif.ex              |  4 +-
 lib/frame.ex                    |  7 ++-
 lib/video_converter.ex          | 84 ++++++++++++++++++++++-------
 lib/video_converter_nif.ex      |  2 +-
 test/decoder_test.exs           |  9 ++++
 test/video_converter_test.exs   | 91 ++++++++++++++++++++------------
 15 files changed, 334 insertions(+), 164 deletions(-)

diff --git a/c_src/xav/video_converter.c b/c_src/xav/video_converter.c
index ded0f78..6565af7 100644
--- a/c_src/xav/video_converter.c
+++ b/c_src/xav/video_converter.c
@@ -1,41 +1,65 @@
 #include "video_converter.h"
 #include "utils.h"
 
-static inline unsigned int video_converter_resolution_changed(struct VideoConverter *converter, AVFrame *frame) {
-  return converter->in_format != frame->format || 
-          converter->in_width != frame->width || 
-          converter->in_height != frame->height;
+static inline unsigned int video_converter_resolution_changed(struct VideoConverter *converter,
+                                                              AVFrame *frame) {
+  return converter->in_format != frame->format || converter->in_width != frame->width ||
+         converter->in_height != frame->height;
 }
 
 struct VideoConverter *video_converter_alloc() {
   struct VideoConverter *converter =
       (struct VideoConverter *)XAV_ALLOC(sizeof(struct VideoConverter));
-  if(converter) {
+  if (converter) {
     converter->sws_ctx = NULL;
     converter->dst_frame = av_frame_alloc();
   }
   return converter;
 }
 
-int video_converter_init(struct VideoConverter *converter, int in_width, int in_height, 
-                          enum AVPixelFormat in_format, enum AVPixelFormat out_format) {                            
+int video_converter_init(struct VideoConverter *converter, int in_width, int in_height,
+                         enum AVPixelFormat in_format, int out_width, int out_height,
+                         enum AVPixelFormat out_format) {
   converter->in_width = in_width;
   converter->in_height = in_height;
   converter->in_format = in_format;
-  converter->out_format = out_format;
 
-  av_frame_unref(converter->dst_frame);
+  converter->out_width = out_width;
+  converter->out_height = out_height;
+  converter->out_format = out_format;
 
-  converter->dst_frame->width = in_width;
-  converter->dst_frame->height = in_height;
-  converter->dst_frame->format = out_format;
+  AVFrame *dst_frame = converter->dst_frame;
+  av_frame_unref(dst_frame);
+
+  dst_frame->format = out_format;
+
+  if (out_width == -1 && out_height == -1) {
+    dst_frame->width = in_width;
+    dst_frame->height = in_height;
+  } else if (out_width == -1) {
+    int width = in_width * out_height / in_height;
+    width = width + (width % 2);
+
+    dst_frame->width = width;
+    dst_frame->height = out_height;
+  } else if (out_height == -1) {
+    int height = in_height * out_width / in_width;
+    height = height + (height % 2);
+
+    dst_frame->width = out_width;
+    dst_frame->height = height;
+  } else {
+    dst_frame->width = out_width;
+    dst_frame->height = out_height;
+  }
 
-  int ret = av_frame_get_buffer(converter->dst_frame, 0);
+  int ret = av_frame_get_buffer(dst_frame, 0);
   if (ret < 0)
     return ret;
 
-  converter->sws_ctx = sws_getContext(in_width, in_height, in_format, in_width, in_height, out_format, 
-                                  SWS_BILINEAR, NULL, NULL, NULL);
+  converter->sws_ctx =
+      sws_getContext(in_width, in_height, in_format, dst_frame->width, dst_frame->height,
+                     dst_frame->format, SWS_BILINEAR, NULL, NULL, NULL);
 
   if (!converter->sws_ctx) {
     XAV_LOG_DEBUG("Couldn't get sws context");
@@ -51,8 +75,8 @@ int video_converter_convert(struct VideoConverter *converter, AVFrame *src_frame
   if (video_converter_resolution_changed(converter, src_frame)) {
     XAV_LOG_DEBUG("Frame resolution changed");
     sws_freeContext(converter->sws_ctx);
-    ret = video_converter_init(converter, src_frame->width, src_frame->height, 
-                                src_frame->format, converter->out_format);
+    ret = video_converter_init(converter, src_frame->width, src_frame->height, src_frame->format,
+                               converter->out_width, converter->out_height, converter->out_format);
     if (ret < 0) {
       return ret;
     }
@@ -61,12 +85,13 @@ int video_converter_convert(struct VideoConverter *converter, AVFrame *src_frame
   converter->dst_frame->pts = src_frame->pts;
 
   // is this (const uint8_t * const*) cast really correct?
-  return sws_scale(converter->sws_ctx, (const uint8_t *const *)src_frame->data, src_frame->linesize, 0,
-                  src_frame->height, converter->dst_frame->data, converter->dst_frame->linesize);
+  return sws_scale(converter->sws_ctx, (const uint8_t *const *)src_frame->data, src_frame->linesize,
+                   0, src_frame->height, converter->dst_frame->data,
+                   converter->dst_frame->linesize);
 }
 
 void video_converter_free(struct VideoConverter **converter) {
-  struct VideoConverter* vc = *converter;
+  struct VideoConverter *vc = *converter;
   if (vc != NULL) {
     if (vc->sws_ctx != NULL) {
       sws_freeContext((*converter)->sws_ctx);
diff --git a/c_src/xav/video_converter.h b/c_src/xav/video_converter.h
index c716fc7..4e5d704 100644
--- a/c_src/xav/video_converter.h
+++ b/c_src/xav/video_converter.h
@@ -6,18 +6,21 @@
 #include <stdint.h>
 
 struct VideoConverter {
-    struct SwsContext *sws_ctx;
-    int in_width;
-    int in_height;
-    enum AVPixelFormat in_format;
-    enum AVPixelFormat out_format;
-    AVFrame *dst_frame;
+  struct SwsContext *sws_ctx;
+  int in_width;
+  int in_height;
+  enum AVPixelFormat in_format;
+  int out_width;
+  int out_height;
+  enum AVPixelFormat out_format;
+  AVFrame *dst_frame;
 };
 
 struct VideoConverter *video_converter_alloc();
 
-int video_converter_init(struct VideoConverter* converter, int in_width, int in_height, 
-                         enum AVPixelFormat in_format, enum AVPixelFormat out_format);
+int video_converter_init(struct VideoConverter *converter, int in_width, int in_height,
+                         enum AVPixelFormat in_format, int out_width, int out_height,
+                         enum AVPixelFormat out_format);
 
 int video_converter_convert(struct VideoConverter *converter, AVFrame *src_frame);
 
diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c
index f5b9445..4fdd462 100644
--- a/c_src/xav/xav_decoder.c
+++ b/c_src/xav/xav_decoder.c
@@ -12,19 +12,17 @@ void free_frames(AVFrame **frames, int size) {
   }
 }
 
-ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 4) {
+ERL_NIF_TERM new (ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+  if (argc != 6) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
-  // resolve codec
-  unsigned int codec_len;
-  if (!enif_get_atom_length(env, argv[0], &codec_len, ERL_NIF_LATIN1)) {
-    return xav_nif_raise(env, "failed_to_get_atom_length");
-  }
+  ERL_NIF_TERM ret;
+  char *codec = NULL;
+  char *out_format = NULL;
 
-  char *codec = (char *)XAV_ALLOC((codec_len + 1) * sizeof(char *));
-  if (enif_get_atom(env, argv[0], codec, codec_len + 1, ERL_NIF_LATIN1) == 0) {
+  // resolve codec
+  if (!xav_get_atom(env, argv[0], &codec)) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
@@ -39,22 +37,18 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   } else if (strcmp(codec, "h264") == 0) {
     media_type = AVMEDIA_TYPE_VIDEO;
     codec_id = AV_CODEC_ID_H264;
-  } else if (strcmp(codec, "h265") == 0) {
+  } else if (strcmp(codec, "h265") == 0 || strcmp(codec, "hevc") == 0) {
     media_type = AVMEDIA_TYPE_VIDEO;
     codec_id = AV_CODEC_ID_HEVC;
   } else {
-    return xav_nif_raise(env, "failed_to_resolve_codec");
+    ret = xav_nif_raise(env, "failed_to_resolve_codec");
+    goto clean;
   }
 
   // resolve output format
-  unsigned int out_format_len;
-  if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
-    return xav_nif_raise(env, "failed_to_get_atom_length");
-  }
-
-  char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
-  if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
-    return xav_nif_raise(env, "failed_to_get_atom");
+  if (!xav_get_atom(env, argv[1], &out_format)) {
+    ret = xav_nif_raise(env, "failed_to_get_atom");
+    goto clean;
   }
 
   enum AVPixelFormat out_video_fmt = AV_PIX_FMT_NONE;
@@ -62,24 +56,40 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   if (media_type == AVMEDIA_TYPE_VIDEO && strcmp(out_format, "nil") != 0) {
     out_video_fmt = av_get_pix_fmt(out_format);
     if (out_video_fmt == AV_PIX_FMT_NONE) {
-      return xav_nif_raise(env, "unknown_out_format");
+      ret = xav_nif_raise(env, "unknown_out_format");
+      goto clean;
     }
   } else if (media_type == AVMEDIA_TYPE_AUDIO && strcmp(out_format, "nil") != 0) {
     out_audo_fmt = av_get_sample_fmt(out_format);
     if (out_audo_fmt == AV_SAMPLE_FMT_NONE) {
-      return xav_nif_raise(env, "unknown_out_format");
+      ret = xav_nif_raise(env, "unknown_out_format");
+      goto clean;
     }
   }
 
   // resolve other params
   int out_sample_rate;
   if (!enif_get_int(env, argv[2], &out_sample_rate)) {
-    return xav_nif_raise(env, "invalid_out_sample_rate");
+    ret = xav_nif_raise(env, "invalid_out_sample_rate");
+    goto clean;
   }
 
   int out_channels;
   if (!enif_get_int(env, argv[3], &out_channels)) {
-    return xav_nif_raise(env, "invalid_out_channels");
+    ret = xav_nif_raise(env, "invalid_out_channels");
+    goto clean;
+  }
+
+  int out_width;
+  if (!enif_get_int(env, argv[4], &out_width)) {
+    ret = xav_nif_raise(env, "failed_to_get_int");
+    goto clean;
+  }
+
+  int out_height;
+  if (!enif_get_int(env, argv[5], &out_height)) {
+    ret = xav_nif_raise(env, "failed_to_get_int");
+    goto clean;
   }
 
   struct XavDecoder *xav_decoder =
@@ -87,26 +97,34 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   xav_decoder->decoder = NULL;
   xav_decoder->ac = NULL;
   xav_decoder->vc = NULL;
-  xav_decoder->out_audio_fmt = out_audo_fmt;
   xav_decoder->out_video_fmt = out_video_fmt;
+  xav_decoder->out_width = out_width;
+  xav_decoder->out_height = out_height;
+  xav_decoder->out_audio_fmt = out_audo_fmt;
   xav_decoder->out_sample_rate = out_sample_rate;
   xav_decoder->out_channels = out_channels;
 
   xav_decoder->decoder = decoder_alloc();
   if (xav_decoder->decoder == NULL) {
-    return xav_nif_raise(env, "failed_to_allocate_decoder");
+    ret = xav_nif_raise(env, "failed_to_allocate_decoder");
+    goto clean;
   }
 
   if (decoder_init(xav_decoder->decoder, media_type, codec_id) != 0) {
-    return xav_nif_raise(env, "failed_to_init_decoder");
+    ret = xav_nif_raise(env, "failed_to_init_decoder");
+    goto clean;
   }
 
-  ERL_NIF_TERM decoder_term = enif_make_resource(env, xav_decoder);
+  ret = enif_make_resource(env, xav_decoder);
   enif_release_resource(xav_decoder);
 
-  XAV_FREE(out_format);
+clean:
+  if (codec != NULL)
+    XAV_FREE(codec);
+  if (out_format != NULL)
+    XAV_FREE(out_format);
 
-  return decoder_term;
+  return ret;
 }
 
 ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame *frame) {
@@ -116,7 +134,9 @@ ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame *fr
   if (xav_decoder->decoder->media_type == AVMEDIA_TYPE_VIDEO) {
     XAV_LOG_DEBUG("Converting video to RGB");
 
-    if (xav_decoder->out_video_fmt == AV_PIX_FMT_NONE) {
+    // no pixel format conversion and no scaling
+    if (xav_decoder->out_video_fmt == AV_PIX_FMT_NONE && xav_decoder->out_width == -1 &&
+        xav_decoder->out_height == -1) {
       return xav_nif_video_frame_to_term(env, frame);
     }
 
@@ -299,8 +319,12 @@ static int init_video_converter(struct XavDecoder *xav_decoder, AVFrame *frame)
     return -1;
   }
 
-  return video_converter_init(xav_decoder->vc, frame->width, frame->height, 
-                                  frame->format, xav_decoder->out_video_fmt);
+  enum AVPixelFormat out_format = xav_decoder->out_video_fmt;
+  if (out_format == AV_PIX_FMT_NONE)
+    out_format = frame->format;
+
+  return video_converter_init(xav_decoder->vc, frame->width, frame->height, frame->format,
+                              xav_decoder->out_width, xav_decoder->out_height, out_format);
 }
 
 void free_xav_decoder(ErlNifEnv *env, void *obj) {
@@ -319,7 +343,7 @@ void free_xav_decoder(ErlNifEnv *env, void *obj) {
   }
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 4, new},
+static ErlNifFunc xav_funcs[] = {{"new", 6, new},
                                  {"decode", 4, decode, ERL_NIF_DIRTY_JOB_CPU_BOUND},
                                  {"flush", 1, flush, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h
index 15813fa..9b29111 100644
--- a/c_src/xav/xav_decoder.h
+++ b/c_src/xav/xav_decoder.h
@@ -1,14 +1,18 @@
 #include "audio_converter.h"
-#include "video_converter.h"
 #include "decoder.h"
+#include "video_converter.h"
 
 #include <libavutil/pixfmt.h>
 
 struct XavDecoder {
   struct Decoder *decoder;
-  struct AudioConverter *ac;
+  // Video params
   struct VideoConverter *vc;
   enum AVPixelFormat out_video_fmt;
+  int out_width;
+  int out_height;
+  // Audio params
+  struct AudioConverter *ac;
   enum AVSampleFormat out_audio_fmt;
   int out_sample_rate;
   int out_channels;
diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c
index b0ef045..33c612c 100644
--- a/c_src/xav/xav_reader.c
+++ b/c_src/xav/xav_reader.c
@@ -5,7 +5,7 @@ static int init_video_converter(struct XavReader *xav_reader, AVFrame *frame);
 
 ErlNifResourceType *xav_reader_resource_type;
 
-ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+ERL_NIF_TERM new (ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   if (argc != 6) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
@@ -290,8 +290,8 @@ static int init_video_converter(struct XavReader *xav_reader, AVFrame *frame) {
     return -1;
   }
 
-  return video_converter_init(xav_reader->vc, frame->width, frame->height, 
-                                  frame->format, AV_PIX_FMT_RGB24);
+  return video_converter_init(xav_reader->vc, frame->width, frame->height, frame->format,
+                              -1, -1, AV_PIX_FMT_RGB24); // -1: output size follows the input
 }
 
 void free_xav_reader(ErlNifEnv *env, void *obj) {
diff --git a/c_src/xav/xav_reader.h b/c_src/xav/xav_reader.h
index 4e22b8f..3849aa8 100644
--- a/c_src/xav/xav_reader.h
+++ b/c_src/xav/xav_reader.h
@@ -1,6 +1,6 @@
 #include "audio_converter.h"
-#include "video_converter.h"
 #include "reader.h"
+#include "video_converter.h"
 
 struct XavReader {
   struct Reader *reader;
diff --git a/c_src/xav/xav_video_converter.c b/c_src/xav/xav_video_converter.c
index 16dc660..8f498a5 100644
--- a/c_src/xav/xav_video_converter.c
+++ b/c_src/xav/xav_video_converter.c
@@ -1,38 +1,68 @@
 #include "xav_video_converter.h"
 
-ErlNifResourceType * xav_video_converter_resource_type;
+ErlNifResourceType *xav_video_converter_resource_type;
 
-ERL_NIF_TERM new(ErlNifEnv * env, int argc, const ERL_NIF_TERM argv[]) {
-  if (argc != 1) {
+// Creates converter->vc for converter->frame; input pixel format is kept if none was requested.
+static int init_video_converter(struct XavVideoConverter *converter) {
+  AVFrame *src = converter->frame;
+  enum AVPixelFormat out_pix_fmt =
+      converter->out_format == AV_PIX_FMT_NONE ? src->format : converter->out_format;
+  struct VideoConverter *vc = video_converter_alloc();
+  if (vc == NULL)
+    return -1;
+  int ret = video_converter_init(vc, src->width, src->height, src->format, converter->out_width,
+                                 converter->out_height, out_pix_fmt);
+  if (ret < 0)
+    video_converter_free(&vc); // never publish a half-initialised converter; next call retries
+  else
+    converter->vc = vc;
+  return ret;
+}
+
+ERL_NIF_TERM new (ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+  if (argc != 3) {
     return xav_nif_error(env, "invalid_arg_count");
   }
 
   ERL_NIF_TERM ret;
-  enum AVPixelFormat pix_fmt;
+  enum AVPixelFormat pix_fmt = AV_PIX_FMT_NONE;
+  int width, height;
   char *format = NULL;
 
-  if(!xav_get_atom(env, argv[0], &format)) {
+  if (!xav_get_atom(env, argv[0], &format)) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
-  pix_fmt = av_get_pix_fmt(format);
-  if (pix_fmt == AV_PIX_FMT_NONE) {
-    ret = xav_nif_raise(env, "unknown_format");
-    goto fail;
+  if (strcmp(format, "nil") != 0) {
+    pix_fmt = av_get_pix_fmt(format);
+    if (pix_fmt == AV_PIX_FMT_NONE) {
+      ret = xav_nif_raise(env, "unknown_format");
+      goto clean;
+    }
+  }
+
+  if (!enif_get_int(env, argv[1], &width)) {
+    ret = xav_nif_raise(env, "failed_to_get_int");
+    goto clean;
+  }
+
+  if (!enif_get_int(env, argv[2], &height)) {
+    ret = xav_nif_raise(env, "failed_to_get_int");
+    goto clean;
   }
 
-  struct XavVideoConverter *xav_video_converter = enif_alloc_resource(xav_video_converter_resource_type, 
-                                                                      sizeof(xav_video_converter));
+  struct XavVideoConverter *xav_video_converter =
+      enif_alloc_resource(xav_video_converter_resource_type, sizeof(struct XavVideoConverter));
   xav_video_converter->vc = NULL;
   xav_video_converter->frame = av_frame_alloc();
   xav_video_converter->out_format = pix_fmt;
+  xav_video_converter->out_width = width;
+  xav_video_converter->out_height = height;
 
-  ERL_NIF_TERM converter_term = enif_make_resource(env, xav_video_converter);
+  ret = enif_make_resource(env, xav_video_converter);
   enif_release_resource(xav_video_converter);
 
-  ret = xav_nif_ok(env, converter_term);
-
-fail:
+clean:
   XAV_FREE(format);
 
   return ret;
@@ -44,7 +74,8 @@ ERL_NIF_TERM convert(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   }
 
   struct XavVideoConverter *xav_video_converter;
-  if (!enif_get_resource(env, argv[0], xav_video_converter_resource_type, (void**) &xav_video_converter)) {
+  if (!enif_get_resource(env, argv[0], xav_video_converter_resource_type,
+                         (void **)&xav_video_converter)) {
     return xav_nif_raise(env, "couldnt_get_converter_resource");
   }
 
@@ -81,23 +112,16 @@ ERL_NIF_TERM convert(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   src_frame->height = height;
   src_frame->format = pix_fmt;
 
-  ret = av_image_fill_arrays(src_frame->data, src_frame->linesize, in_data.data, 
-                              src_frame->format, width, height, 1);
-
+  int int_ret = av_image_fill_arrays(src_frame->data, src_frame->linesize, in_data.data,
+                                     src_frame->format, width, height, 1);
 
-  if (ret < 0) { 
+  if (int_ret < 0) {
     ret = xav_nif_raise(env, "failed_to_fill_arrays");
     goto clean;
   }
 
   if (xav_video_converter->vc == NULL) {
-    xav_video_converter->vc = video_converter_alloc();
-    if (xav_video_converter->vc == NULL) {
-      ret = xav_nif_raise(env, "failed_to_allocate_converter");
-      goto clean;
-    }
-
-    if (video_converter_init(xav_video_converter->vc, width, height, pix_fmt, xav_video_converter->out_format) < 0) {
+    if (init_video_converter(xav_video_converter) < 0) {
       ret = xav_nif_raise(env, "failed_to_init_converter");
       goto clean;
     }
@@ -111,14 +135,15 @@ ERL_NIF_TERM convert(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
   ret = xav_nif_video_frame_to_term(env, xav_video_converter->vc->dst_frame);
 
 clean:
-  if (format != NULL) XAV_FREE(format);
+  if (format != NULL)
+    XAV_FREE(format);
 
   return ret;
 }
 
-void free_xav_video_converter(ErlNifEnv * env, void * obj) {
+void free_xav_video_converter(ErlNifEnv *env, void *obj) {
   XAV_LOG_DEBUG("Freeing XavVideoConverter object");
-  struct XavVideoConverter * xav_video_converter = (struct XavVideoConverter * ) obj;
+  struct XavVideoConverter *xav_video_converter = (struct XavVideoConverter *)obj;
   if (xav_video_converter->vc != NULL) {
     video_converter_free(&xav_video_converter->vc);
   }
@@ -126,12 +151,12 @@ void free_xav_video_converter(ErlNifEnv * env, void * obj) {
   av_frame_free(&xav_video_converter->frame);
 }
 
-static ErlNifFunc xav_funcs[] = {{"new", 1, new}, 
+static ErlNifFunc xav_funcs[] = {{"new", 3, new},
                                  {"convert", 5, convert, ERL_NIF_DIRTY_JOB_CPU_BOUND}};
 
-static int load(ErlNifEnv * env, void ** priv, ERL_NIF_TERM load_info) {
-  xav_video_converter_resource_type = 
-    enif_open_resource_type(env, NULL, "XavVideoConverter", free_xav_video_converter, ERL_NIF_RT_CREATE, NULL);
+static int load(ErlNifEnv *env, void **priv, ERL_NIF_TERM load_info) {
+  xav_video_converter_resource_type = enif_open_resource_type(
+      env, NULL, "XavVideoConverter", free_xav_video_converter, ERL_NIF_RT_CREATE, NULL);
   return 0;
 }
 
diff --git a/c_src/xav/xav_video_converter.h b/c_src/xav/xav_video_converter.h
index f21655e..4de5274 100644
--- a/c_src/xav/xav_video_converter.h
+++ b/c_src/xav/xav_video_converter.h
@@ -1,8 +1,10 @@
-#include "video_converter.h"
 #include "utils.h"
+#include "video_converter.h"
 
 struct XavVideoConverter {
-    struct VideoConverter *vc;
-    enum AVPixelFormat out_format;
-    AVFrame* frame;
+  struct VideoConverter *vc;
+  enum AVPixelFormat out_format;
+  int out_width;
+  int out_height;
+  AVFrame *frame;
 };
\ No newline at end of file
diff --git a/lib/decoder.ex b/lib/decoder.ex
index 9825ed4..efb79cf 100644
--- a/lib/decoder.ex
+++ b/lib/decoder.ex
@@ -16,7 +16,9 @@ defmodule Xav.Decoder do
   @type opts :: [
           out_format: Xav.Frame.format(),
           out_sample_rate: integer(),
-          out_channels: integer()
+          out_channels: integer(),
+          out_width: Xav.Frame.width(),
+          out_height: Xav.Frame.height()
         ]
 
   @doc """
@@ -59,7 +61,9 @@ defmodule Xav.Decoder do
     out_format = opts[:out_format]
     out_sample_rate = opts[:out_sample_rate] || 0
     out_channels = opts[:out_channels] || 0
-    Xav.Decoder.NIF.new(codec, out_format, out_sample_rate, out_channels)
+    out_width = opts[:out_width] || -1
+    out_height = opts[:out_height] || -1
+    Xav.Decoder.NIF.new(codec, out_format, out_sample_rate, out_channels, out_width, out_height)
   end
 
   @doc """
diff --git a/lib/decoder_nif.ex b/lib/decoder_nif.ex
index f2468d2..4485c75 100644
--- a/lib/decoder_nif.ex
+++ b/lib/decoder_nif.ex
@@ -8,7 +8,9 @@ defmodule Xav.Decoder.NIF do
     :ok = :erlang.load_nif(path, 0)
   end
 
-  def new(_codec, _out_format, _out_sample_rate, _out_channels), do: :erlang.nif_error(:undef)
+  def new(_codec, _out_format, _out_sample_rate, _out_channels, _out_width, _out_height) do
+    :erlang.nif_error(:undef)
+  end
 
   def decode(_decoder, _data, _pts, _dts), do: :erlang.nif_error(:undef)
 
diff --git a/lib/frame.ex b/lib/frame.ex
index 8b03ba6..572c0a8 100644
--- a/lib/frame.ex
+++ b/lib/frame.ex
@@ -23,12 +23,15 @@ defmodule Xav.Frame do
 
   @type format() :: audio_format() | video_format()
 
+  @type width :: non_neg_integer() | nil
+  @type height :: non_neg_integer() | nil
+
   @type t() :: %__MODULE__{
           type: :audio | :video,
           data: binary(),
           format: format(),
-          width: non_neg_integer() | nil,
-          height: non_neg_integer() | nil,
+          width: width(),
+          height: height(),
           samples: integer() | nil,
           pts: integer()
         }
diff --git a/lib/video_converter.ex b/lib/video_converter.ex
index f5ffc89..ec6a9b8 100644
--- a/lib/video_converter.ex
+++ b/lib/video_converter.ex
@@ -2,50 +2,70 @@ defmodule Xav.VideoConverter do
   @moduledoc """
   Video samples converter.
 
-  Currently it only supports pixel format conversion.
+  It supports pixel format conversion and/or scaling.
   """
 
   alias Xav.Frame
   alias Xav.VideoConverter.NIF
 
-  @type t :: %__MODULE__{format: Frame.video_format(), converter: reference()}
+  @type t :: %__MODULE__{
+          converter: reference(),
+          out_format: Frame.video_format(),
+          out_width: Frame.width(),
+          out_height: Frame.height()
+        }
 
   @typedoc """
   Type definition for converter options.
 
-  * `format` - video format to convert to (`e.g. :rgb24`).
+  * `out_format` - video format to convert to (`e.g. :rgb24`).
+  * `out_width` - scale the video frame to this width.
+  * `out_height` - scale the video frame to this height.
+
+  If neither `out_width` nor `out_height` is provided, no scaling is performed. If only one of
+  them is provided, the other is calculated from the input dimensions so as to keep the aspect
+  ratio (rounded up to an even value).
   """
-  @type converter_opts() :: [format: Frame.video_format()]
+  @type converter_opts() :: [
+          out_format: Frame.video_format(),
+          out_width: Frame.width(),
+          out_height: Frame.height()
+        ]
 
-  @enforce_keys [:format]
-  defstruct [:format, :converter]
+  defstruct [:converter, :out_format, :out_width, :out_height]
 
   @doc """
   Creates a new video converter.
   """
-  @spec new(converter_opts()) :: {:ok, t()} | {:error, any()}
+  @spec new(converter_opts()) :: t()
   def new(converter_opts) do
-    with {:ok, converter} <- NIF.new(converter_opts[:format]) do
-      {:ok, %__MODULE__{format: converter_opts[:format], converter: converter}}
-    end
-  end
+    opts = Keyword.validate!(converter_opts, [:out_format, :out_width, :out_height])
 
-  @doc """
-  Same as `new/1` but raises an exception in case of an error.
-  """
-  @spec new!(converter_opts()) :: t()
-  def new!(converter_opts) do
-    case new(converter_opts) do
-      {:ok, ref} -> ref
-      {:error, reason} -> raise "Couldn't create a video converter. Reason: #{inspect(reason)}"
+    if is_nil(opts[:out_format]) and is_nil(opts[:out_width]) and is_nil(opts[:out_height]) do
+      raise "At least one of `out_format`, `out_width` or `out_height` must be provided"
     end
+
+    :ok = validate_converter_options(opts)
+
+    converter = NIF.new(opts[:out_format], opts[:out_width] || -1, opts[:out_height] || -1)
+
+    %__MODULE__{
+      converter: converter,
+      out_format: opts[:out_format],
+      out_width: opts[:out_width],
+      out_height: opts[:out_height]
+    }
   end
 
   @doc """
   Converts a video frame.
   """
   @spec convert(t(), Frame.t()) :: Frame.t()
-  def convert(%__MODULE__{format: format}, %Frame{format: format} = frame), do: frame
+  def convert(
+        %__MODULE__{out_format: format, out_width: nil, out_height: nil},
+        %Frame{format: format} = frame
+      ),
+      do: frame
 
   def convert(%__MODULE__{converter: converter}, frame) do
     {data, out_format, width, height, _pts} =
@@ -60,4 +80,28 @@ defmodule Xav.VideoConverter do
       pts: frame.pts
     }
   end
+
+  # Checks the converter options one by one, in the order they were given.
+  # Options set to `nil` count as "not provided" and are accepted as-is, while
+  # `:out_width` and `:out_height` must be integers >= 1 (`ArgumentError` otherwise).
+  defp validate_converter_options(options) do
+    Enum.each(options, fn
+      {_name, nil} ->
+        :ok
+
+      {name, val} when name in [:out_width, :out_height] and not is_integer(val) ->
+        raise %ArgumentError{
+          message: "Expected an integer value for #{inspect(name)}, received: #{inspect(val)}"
+        }
+
+      {name, val} when name in [:out_width, :out_height] and val < 1 ->
+        raise %ArgumentError{
+          message: "Invalid value for #{inspect(name)}, expected a value to be >= 1"
+        }
+
+      # anything else (e.g. `:out_format`) is not validated here
+      {_name, _val} ->
+        :ok
+    end)
+  end
 end
diff --git a/lib/video_converter_nif.ex b/lib/video_converter_nif.ex
index e09051e..9521df1 100644
--- a/lib/video_converter_nif.ex
+++ b/lib/video_converter_nif.ex
@@ -8,7 +8,7 @@ defmodule Xav.VideoConverter.NIF do
     :ok = :erlang.load_nif(path, 0)
   end
 
-  def new(_format), do: :erlang.nif_error(:undef)
+  def new(_format, _width, _height), do: :erlang.nif_error(:undef)
 
   def convert(_converter, _frame, _width, _height, _pix_format), do: :erlang.nif_error(:undef)
 end
diff --git a/test/decoder_test.exs b/test/decoder_test.exs
index 0011cbd..dd1d643 100644
--- a/test/decoder_test.exs
+++ b/test/decoder_test.exs
@@ -365,5 +365,14 @@ defmodule Xav.DecoderTest do
 
       assert byte_size(frame) == 640 * 480 * 3
     end
+
+    test "scale video frame" do
+      decoder = Xav.Decoder.new(:vp8, out_width: 240, out_height: 180)
+
+      assert {:ok, %Xav.Frame{width: 240, height: 180, pts: 0, data: frame, format: :yuv420p}} =
+               Xav.Decoder.decode(decoder, @vp8_keyframe)
+      # 3/2 bytes per pixel for the scaled frame; `div/2` keeps the expected size an integer
+      assert byte_size(frame) == div(240 * 180 * 3, 2)
+    end
   end
 end
diff --git a/test/video_converter_test.exs b/test/video_converter_test.exs
index 7e22c2f..42bb0fc 100644
--- a/test/video_converter_test.exs
+++ b/test/video_converter_test.exs
@@ -1,34 +1,42 @@
 defmodule Xav.VideoConverterTest do
   use ExUnit.Case, async: true
 
-  test "new/1" do
-    assert {:ok, %Xav.VideoConverter{format: :rgb24, converter: converter}} =
-             Xav.VideoConverter.new(format: :rgb24)
+  describe "new/1" do
+    test "new converter" do
+      assert %Xav.VideoConverter{out_format: :rgb24, converter: converter} =
+               Xav.VideoConverter.new(out_format: :rgb24)
 
-    assert is_reference(converter)
-  end
+      assert is_reference(converter)
+    end
+
+    test "fails when no option is provided" do
+      assert_raise RuntimeError, fn -> Xav.VideoConverter.new(out_format: nil) end
+    end
 
-  test "new!/1" do
-    assert %Xav.VideoConverter{} = Xav.VideoConverter.new!(format: :rgb24)
-    assert_raise ErlangError, fn -> Xav.VideoConverter.new!(format: :rgb) end
+    test "fails on invalid options" do
+      assert_raise ArgumentError, fn -> Xav.VideoConverter.new(out_width: 0) end
+      assert_raise ArgumentError, fn -> Xav.VideoConverter.new(out_height: "15") end
+    end
   end
 
   describe "convert/2" do
     setup do
-      %{converter: Xav.VideoConverter.new!(format: :rgb24)}
-    end
+      frame_480p = %Xav.Frame{
+        type: :video,
+        data: File.read!("test/fixtures/video_converter/frame_480x360.yuv"),
+        format: :yuv420p,
+        width: 480,
+        height: 360,
+        pts: 0
+      }
 
-    test "convert video format", %{converter: converter} do
-      assert frame =
-               Xav.VideoConverter.convert(converter, %Xav.Frame{
-                 type: :video,
-                 data: File.read!("test/fixtures/video_converter/frame_480x360.yuv"),
-                 format: :yuv420p,
-                 width: 480,
-                 height: 360,
-                 pts: 0
-               })
+      %{
+        converter: Xav.VideoConverter.new(out_format: :rgb24),
+        frame_480p: frame_480p
+      }
+    end
 
+    test "convert video format", %{converter: converter, frame_480p: frame_480p} do
       # reference frame is generated using ffmeg
       # ffmpeg -f rawvideo -pix_fmt yuv420p -video_size 480x360 -i frame_480x360.yuv -pix_fmt rgb24 ref_frame_480x360.yuv
       ref_data = File.read!("test/fixtures/video_converter/ref_frame_480x360.rgb")
@@ -40,19 +48,11 @@ defmodule Xav.VideoConverterTest do
                width: 480,
                height: 360,
                pts: 0
-             } = frame
+             } = Xav.VideoConverter.convert(converter, frame_480p)
     end
 
-    test "converter re-init on resolution change", %{converter: converter} do
-      frame1 = %Xav.Frame{
-        type: :video,
-        data: File.read!("test/fixtures/video_converter/frame_480x360.yuv"),
-        format: :yuv420p,
-        width: 480,
-        height: 360
-      }
-
-      frame2 = %Xav.Frame{
+    test "converter re-init on resolution change", %{converter: converter, frame_480p: frame_480p} do
+      frame_360p = %Xav.Frame{
         type: :video,
         data: File.read!("test/fixtures/video_converter/frame_360x240.yuv"),
         format: :yuv420p,
@@ -61,13 +61,38 @@ defmodule Xav.VideoConverterTest do
       }
 
       assert %Xav.Frame{format: :rgb24, data: ref_frame1} =
-               Xav.VideoConverter.convert(converter, frame1)
+               Xav.VideoConverter.convert(converter, frame_480p)
 
       assert %Xav.Frame{format: :rgb24, data: ref_frame2} =
-               Xav.VideoConverter.convert(converter, frame2)
+               Xav.VideoConverter.convert(converter, frame_360p)
 
       assert byte_size(ref_frame1) == 480 * 360 * 3
       assert byte_size(ref_frame2) == 360 * 240 * 3
     end
+
+    test "scale video frame", %{frame_480p: frame_480p} do
+      converter = Xav.VideoConverter.new(out_width: 368)
+
+      # only the width is given, so the height follows the 480x360 input: 360 * 368 / 480 = 276
+      assert %Xav.Frame{
+               type: :video,
+               format: :yuv420p,
+               data: data,
+               width: 368,
+               height: 276
+             } = Xav.VideoConverter.convert(converter, frame_480p)
+      assert byte_size(data) == div(368 * 276 * 3, 2)
+    end
+
+    test "scale and convert video frame", %{frame_480p: frame_480p} do
+      converter = Xav.VideoConverter.new(out_width: 360, out_height: 240, out_format: :rgb24)
+
+      assert %Xav.Frame{
+               type: :video,
+               format: :rgb24,
+               width: 360,
+               height: 240
+             } = Xav.VideoConverter.convert(converter, frame_480p)
+    end
   end
 end