diff --git a/README.md b/README.md
index 2f89f48..e760273 100644
--- a/README.md
+++ b/README.md
@@ -29,7 +29,7 @@ end
 Decode
 
 ```elixir
-decoder = Xav.Decoder.new(:vp8)
+decoder = Xav.Decoder.new(:vp8, out_format: :rgb24)
 {:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, <<"somebinary">>)
 ```
 
@@ -52,7 +52,7 @@ Kino.Image.new(tensor)
 Read from a camera:
 
 ```elixir
-r = Xav.Reader.new!("/dev/video0", device?: true)
+r = Xav.Reader.new!("/dev/video0", device?: true, out_format: :rgb24)
 {:ok, %Xav.Frame{} = frame} = Xav.Reader.next_frame(r)
 tensor = Xav.Frame.to_nx(frame)
 Kino.Image.new(tensor)
diff --git a/c_src/xav/decoder.c b/c_src/xav/decoder.c
index ab088a3..0bac93a 100644
--- a/c_src/xav/decoder.c
+++ b/c_src/xav/decoder.c
@@ -9,39 +9,18 @@ struct Decoder *decoder_alloc() {
 
   decoder->codec = NULL;
   decoder->c = NULL;
-  decoder->out_format = AV_PIX_FMT_NONE;
 
   return decoder;
 }
 
-int decoder_init(struct Decoder *decoder, const char *codec, const char* out_format) {
-  if (strcmp(codec, "opus") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_AUDIO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_OPUS);
-  } else if (strcmp(codec, "vp8") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_VIDEO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_VP8);
-  } else if (strcmp(codec, "h264") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_VIDEO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_H264);
-  } else if (strcmp(codec, "h265") == 0) {
-    decoder->media_type = AVMEDIA_TYPE_VIDEO;
-    decoder->codec = avcodec_find_decoder(AV_CODEC_ID_HEVC);
-  } else {
-    return -1;
-  }
+int decoder_init(struct Decoder *decoder, enum AVMediaType media_type, enum AVCodecID codec_id) {
+  decoder->media_type = media_type;
+  decoder->codec = avcodec_find_decoder(codec_id);
 
   if (!decoder->codec) {
     return -1;
   }
 
-  if(decoder->media_type == AVMEDIA_TYPE_VIDEO && strcmp(out_format, "nil") != 0) {
-    decoder->out_format = av_get_pix_fmt(out_format);
-    if (decoder->out_format == AV_PIX_FMT_NONE) {
-      return -1;
-    }
-  }
-
   decoder->c = avcodec_alloc_context3(decoder->codec);
   if (!decoder->c) {
     return -1;
@@ -74,7 +53,7 @@ int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame) {
   return avcodec_receive_frame(decoder->c, frame);
 }
 
-int decoder_flush(struct Decoder *decoder, AVFrame **frames, int *frames_count) { 
+int decoder_flush(struct Decoder *decoder, AVFrame **frames, int *frames_count) {
   int ret = avcodec_send_packet(decoder->c, NULL);
   if (ret != 0) {
     return ret;
diff --git a/c_src/xav/decoder.h b/c_src/xav/decoder.h
index d00ef06..67bd675 100644
--- a/c_src/xav/decoder.h
+++ b/c_src/xav/decoder.h
@@ -8,7 +8,6 @@
 
 struct Decoder {
   enum AVMediaType media_type;
-  enum AVPixelFormat out_format;
   AVFrame *frame;
   AVPacket *pkt;
   const AVCodec *codec;
@@ -17,7 +16,7 @@ struct Decoder {
 
 struct Decoder *decoder_alloc();
 
-int decoder_init(struct Decoder *decoder, const char *codec, const char* out_format);
+int decoder_init(struct Decoder *decoder, enum AVMediaType media_type, enum AVCodecID codec_id);
 
 int decoder_decode(struct Decoder *decoder, AVPacket *pkt, AVFrame *frame);
 
diff --git a/c_src/xav/utils.c b/c_src/xav/utils.c
index 76e3cba..255b14e 100644
--- a/c_src/xav/utils.c
+++ b/c_src/xav/utils.c
@@ -1,6 +1,6 @@
 #include "utils.h"
 
-#include
 #include
+#include
 #include
 #include
@@ -21,14 +21,14 @@ ERL_NIF_TERM xav_nif_raise(ErlNifEnv *env, char *msg) {
 }
 
 ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int out_samples,
-                                         int out_size, const char *out_format, int pts) {
+                                         int out_size, enum AVSampleFormat out_format, int pts) {
   ERL_NIF_TERM data_term;
   unsigned char *ptr = enif_make_new_binary(env, out_size, &data_term);
 
   memcpy(ptr, out_data[0], out_size);
 
   ERL_NIF_TERM samples_term = enif_make_int(env, out_samples);
-  ERL_NIF_TERM format_term = enif_make_atom(env, out_format);
+  ERL_NIF_TERM format_term = enif_make_atom(env, av_get_sample_fmt_name(out_format));
   ERL_NIF_TERM pts_term = enif_make_int(env, pts);
 
   return enif_make_tuple(env, 4, data_term, format_term, samples_term, pts_term);
@@ -39,9 +39,10 @@ ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame) {
 
   int payload_size = av_image_get_buffer_size(frame->format, frame->width, frame->height, 1);
   unsigned char *ptr = enif_make_new_binary(env, payload_size, &data_term);
- 
-  av_image_copy_to_buffer(ptr, payload_size, (const uint8_t *const *)frame->data,
-                          (const int*)frame->linesize, frame->format, frame->width, frame->height, 1);
+
+  av_image_copy_to_buffer(ptr, payload_size, (const uint8_t *const *)frame->data,
+                          (const int *)frame->linesize, frame->format, frame->width, frame->height,
+                          1);
 
   ERL_NIF_TERM format_term = enif_make_atom(env, av_get_pix_fmt_name(frame->format));
   ERL_NIF_TERM height_term = enif_make_int(env, frame->height);
diff --git a/c_src/xav/utils.h b/c_src/xav/utils.h
index 78b2f30..c91e0e2 100644
--- a/c_src/xav/utils.h
+++ b/c_src/xav/utils.h
@@ -20,4 +20,4 @@ ERL_NIF_TERM xav_nif_error(ErlNifEnv *env, char *reason);
 ERL_NIF_TERM xav_nif_raise(ErlNifEnv *env, char *msg);
 ERL_NIF_TERM xav_nif_video_frame_to_term(ErlNifEnv *env, AVFrame *frame);
 ERL_NIF_TERM xav_nif_audio_frame_to_term(ErlNifEnv *env, uint8_t **out_data, int out_samples,
-                                         int out_size, const char *out_format, int pts);
+                                         int out_size, enum AVSampleFormat out_format, int pts);
diff --git a/c_src/xav/video_converter.c b/c_src/xav/video_converter.c
index 9b3f42b..4d04e9f 100644
--- a/c_src/xav/video_converter.c
+++ b/c_src/xav/video_converter.c
@@ -1,6 +1,7 @@
 #include "video_converter.h"
 
-int video_converter_convert(AVFrame *src_frame, AVFrame **dst_frame, enum AVPixelFormat out_format) {
+int video_converter_convert(AVFrame *src_frame, AVFrame **dst_frame,
+                            enum AVPixelFormat out_format) {
   int ret;
 
   *dst_frame = av_frame_alloc();
@@ -26,7 +27,6 @@ int video_converter_convert(AVFrame *src_frame, AVFrame **dst_frame, enum AVPixe
     return ret;
   }
 
-  // is this (const uint8_t * const*) cast really correct?
   ret = sws_scale(sws_ctx, (const uint8_t *const *)src_frame->data, src_frame->linesize, 0,
                   src_frame->height, (*dst_frame)->data, (*dst_frame)->linesize);
 
diff --git a/c_src/xav/xav_decoder.c b/c_src/xav/xav_decoder.c
index 10c188c..7b88ef0 100644
--- a/c_src/xav/xav_decoder.c
+++ b/c_src/xav/xav_decoder.c
@@ -17,28 +17,61 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     return xav_nif_raise(env, "invalid_arg_count");
   }
 
+  // resolve codec
   unsigned int codec_len;
   if (!enif_get_atom_length(env, argv[0], &codec_len, ERL_NIF_LATIN1)) {
     return xav_nif_raise(env, "failed_to_get_atom_length");
   }
 
   char *codec = (char *)XAV_ALLOC((codec_len + 1) * sizeof(char *));
-
   if (enif_get_atom(env, argv[0], codec, codec_len + 1, ERL_NIF_LATIN1) == 0) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
+  enum AVMediaType media_type;
+  enum AVCodecID codec_id;
+  if (strcmp(codec, "opus") == 0) {
+    media_type = AVMEDIA_TYPE_AUDIO;
+    codec_id = AV_CODEC_ID_OPUS;
+  } else if (strcmp(codec, "vp8") == 0) {
+    media_type = AVMEDIA_TYPE_VIDEO;
+    codec_id = AV_CODEC_ID_VP8;
+  } else if (strcmp(codec, "h264") == 0) {
+    media_type = AVMEDIA_TYPE_VIDEO;
+    codec_id = AV_CODEC_ID_H264;
+  } else if (strcmp(codec, "h265") == 0) {
+    media_type = AVMEDIA_TYPE_VIDEO;
+    codec_id = AV_CODEC_ID_HEVC;
+  } else {
+    return xav_nif_raise(env, "failed_to_resolve_codec");
+  }
+
+  // resolve output format
   unsigned int out_format_len;
   if (!enif_get_atom_length(env, argv[1], &out_format_len, ERL_NIF_LATIN1)) {
     return xav_nif_raise(env, "failed_to_get_atom_length");
   }
 
   char *out_format = (char *)XAV_ALLOC((out_format_len + 1) * sizeof(char *));
-
   if (enif_get_atom(env, argv[1], out_format, out_format_len + 1, ERL_NIF_LATIN1) == 0) {
     return xav_nif_raise(env, "failed_to_get_atom");
   }
 
+  enum AVPixelFormat out_video_fmt = AV_PIX_FMT_NONE;
+  enum AVSampleFormat out_audo_fmt = AV_SAMPLE_FMT_NONE;
+  if (media_type == AVMEDIA_TYPE_VIDEO && strcmp(out_format, "nil") != 0) {
+    out_video_fmt = av_get_pix_fmt(out_format);
+    if (out_video_fmt == AV_PIX_FMT_NONE) {
+      return xav_nif_raise(env, "unknown_out_format");
+    }
+  } else if (media_type == AVMEDIA_TYPE_AUDIO && strcmp(out_format, "nil") != 0) {
+    out_audo_fmt = av_get_sample_fmt(out_format);
+    if (out_audo_fmt == AV_SAMPLE_FMT_NONE) {
+      return xav_nif_raise(env, "unknown_out_format");
+    }
+  }
+
+  // resolve other params
   int out_sample_rate;
   if (!enif_get_int(env, argv[2], &out_sample_rate)) {
     return xav_nif_raise(env, "invalid_out_sample_rate");
@@ -53,7 +86,8 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
       enif_alloc_resource(xav_decoder_resource_type, sizeof(struct XavDecoder));
   xav_decoder->decoder = NULL;
   xav_decoder->ac = NULL;
-  xav_decoder->out_format = out_format;
+  xav_decoder->out_audio_fmt = out_audo_fmt;
+  xav_decoder->out_video_fmt = out_video_fmt;
   xav_decoder->out_sample_rate = out_sample_rate;
   xav_decoder->out_channels = out_channels;
 
@@ -62,31 +96,31 @@ ERL_NIF_TERM new(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
     return xav_nif_raise(env, "failed_to_allocate_decoder");
   }
 
-  if (decoder_init(xav_decoder->decoder, codec, xav_decoder->out_format) != 0) {
+  if (decoder_init(xav_decoder->decoder, media_type, codec_id) != 0) {
     return xav_nif_raise(env, "failed_to_init_decoder");
   }
 
   ERL_NIF_TERM decoder_term = enif_make_resource(env, xav_decoder);
   enif_release_resource(xav_decoder);
 
+  XAV_FREE(out_format);
+
   return decoder_term;
 }
 
-ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame* frame) {
+ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame *frame) {
   ERL_NIF_TERM frame_term;
   int ret;
 
   if (xav_decoder->decoder->media_type == AVMEDIA_TYPE_VIDEO) {
     XAV_LOG_DEBUG("Converting video to RGB");
 
-    int out_pix_fmt = xav_decoder->decoder->out_format;
-
-    if (out_pix_fmt == AV_PIX_FMT_NONE) {
+    if (xav_decoder->out_video_fmt == AV_PIX_FMT_NONE) {
       return xav_nif_video_frame_to_term(env, frame);
     }
 
     AVFrame *dst_frame;
-    ret = video_converter_convert(frame, &dst_frame, out_pix_fmt);
+    ret = video_converter_convert(frame, &dst_frame, xav_decoder->out_video_fmt);
     if (ret <= 0) {
       return xav_nif_raise(env, "failed_to_decode");
     }
@@ -104,7 +138,7 @@ ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame* fr
     if (xav_decoder->ac == NULL) {
       ret = init_audio_converter(xav_decoder);
       if (ret < 0) {
-        return xav_nif_raise(env, "failed_to_init_converter");;
+        return xav_nif_raise(env, "failed_to_init_converter");
       }
     }
 
@@ -113,15 +147,8 @@ ERL_NIF_TERM convert(ErlNifEnv *env, struct XavDecoder *xav_decoder, AVFrame* fr
       return xav_nif_raise(env, "failed_to_decode");
     }
 
-    const char *out_format = av_get_sample_fmt_name(xav_decoder->ac->out_sample_fmt);
-
-    if (strcmp(out_format, "flt") == 0) {
-      out_format = "f32";
-    } else if (strcmp(out_format, "dbl") == 0) {
-      out_format = "f64";
-    }
-
-    frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format, frame->pts);
+    frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size,
+                                             xav_decoder->out_audio_fmt, frame->pts);
 
     av_freep(&out_data[0]);
   }
@@ -229,23 +256,12 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
     out_sample_rate = xav_decoder->out_sample_rate;
   }
 
-  enum AVSampleFormat out_sample_fmt;
-  if (strcmp(xav_decoder->out_format, "u8") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_U8;
-  } else if (strcmp(xav_decoder->out_format, "s16") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_S16;
-  } else if (strcmp(xav_decoder->out_format, "s32") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_S32;
-  } else if (strcmp(xav_decoder->out_format, "s64") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_S64;
-  } else if (strcmp(xav_decoder->out_format, "f32") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_FLT;
-  } else if (strcmp(xav_decoder->out_format, "f64") == 0) {
-    out_sample_fmt = AV_SAMPLE_FMT_DBL;
-  } else if (strcmp(xav_decoder->out_format, "nil") == 0) {
-    out_sample_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
-  } else {
-    return -1;
+  // If user didn't request any specific format,
+  // just take the original format but in the packed form.
+  // We need to call this function here, as in the decoder_init we don't know
+  // what is the sample_fmt yet.
+  if (xav_decoder->out_audio_fmt == AV_SAMPLE_FMT_NONE) {
+    xav_decoder->out_audio_fmt = av_get_alt_sample_fmt(xav_decoder->decoder->c->sample_fmt, 0);
   }
 
   struct ChannelLayout in_chlayout, out_chlayout;
@@ -267,7 +283,7 @@ static int init_audio_converter(struct XavDecoder *xav_decoder) {
 
   return audio_converter_init(xav_decoder->ac, in_chlayout, xav_decoder->decoder->c->sample_rate,
                               xav_decoder->decoder->c->sample_fmt, out_chlayout, out_sample_rate,
-                              out_sample_fmt);
+                              xav_decoder->out_audio_fmt);
 }
 
 void free_xav_decoder(ErlNifEnv *env, void *obj) {
diff --git a/c_src/xav/xav_decoder.h b/c_src/xav/xav_decoder.h
index 08e01fe..d640381 100644
--- a/c_src/xav/xav_decoder.h
+++ b/c_src/xav/xav_decoder.h
@@ -1,10 +1,13 @@
 #include "audio_converter.h"
 #include "decoder.h"
 
+#include
+
 struct XavDecoder {
   struct Decoder *decoder;
   struct AudioConverter *ac;
-  char *out_format;
+  enum AVPixelFormat out_video_fmt;
+  enum AVSampleFormat out_audio_fmt;
   int out_sample_rate;
   int out_channels;
 };
\ No newline at end of file
diff --git a/c_src/xav/xav_reader.c b/c_src/xav/xav_reader.c
index fcaf0d8..f1c8fdb 100644
--- a/c_src/xav/xav_reader.c
+++ b/c_src/xav/xav_reader.c
@@ -169,16 +169,9 @@ ERL_NIF_TERM next_frame(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
       return xav_nif_raise(env, "failed_to_read");
     }
 
-    const char *out_format = av_get_sample_fmt_name(xav_reader->ac->out_sample_fmt);
-
-    if (strcmp(out_format, "flt") == 0) {
-      out_format = "f32";
-    } else if (strcmp(out_format, "dbl") == 0) {
-      out_format = "f64";
-    }
-
-    frame_term = xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size, out_format,
-                                             xav_reader->reader->frame->pts);
+    frame_term =
+        xav_nif_audio_frame_to_term(env, out_data, out_samples, out_size,
+                                    xav_reader->ac->out_sample_fmt, xav_reader->reader->frame->pts);
 
     av_freep(&out_data[0]);
   }
diff --git a/lib/decoder.ex b/lib/decoder.ex
index ca3cef9..9825ed4 100644
--- a/lib/decoder.ex
+++ b/lib/decoder.ex
@@ -82,6 +82,7 @@ defmodule Xav.Decoder do
         :ok
 
       {:ok, {data, format, width, height, pts}} ->
+        format = normalize_format(format)
        {:ok, Xav.Frame.new(data, format, width, height, pts)}
 
      # Sometimes, audio converter might not return data immediately.
@@ -89,6 +90,7 @@ defmodule Xav.Decoder do
         :ok
 
       {:ok, {data, format, samples, pts}} ->
+        format = normalize_format(format)
         {:ok, Xav.Frame.new(data, format, samples, pts)}
 
       {:error, _reason} = error ->
@@ -123,4 +125,9 @@ defmodule Xav.Decoder do
       {:error, reason} -> raise "Failed to flush decoder: #{inspect(reason)}"
     end
   end
+
+  # Use the same formats as Nx
+  defp normalize_format(:flt), do: :f32
+  defp normalize_format(:dbl), do: :f64
+  defp normalize_format(other), do: other
 end
diff --git a/lib/reader.ex b/lib/reader.ex
index 67c9a5d..f825b10 100644
--- a/lib/reader.ex
+++ b/lib/reader.ex
@@ -117,6 +117,7 @@ defmodule Xav.Reader do
   def next_frame(%__MODULE__{reader: ref} = reader) do
     case Xav.Reader.NIF.next_frame(ref) do
       {:ok, {data, format, width, height, pts}} ->
+        format = normalize_format(format)
         {:ok, Xav.Frame.new(data, format, width, height, pts)}
 
       {:ok, {"", _format, _samples, _pts}} ->
@@ -125,6 +126,7 @@ defmodule Xav.Reader do
         next_frame(reader)
 
       {:ok, {data, format, samples, pts}} ->
+        format = normalize_format(format)
         {:ok, Xav.Frame.new(data, format, samples, pts)}
 
       {:error, :eof} = err ->
@@ -173,4 +175,9 @@ defmodule Xav.Reader do
   defp to_int(:audio), do: 0
   defp to_int(true), do: 1
   defp to_int(false), do: 0
+
+  # Use the same formats as Nx
+  defp normalize_format(:flt), do: :f32
+  defp normalize_format(:dbl), do: :f64
+  defp normalize_format(other), do: other
 end
diff --git a/test/decoder_test.exs b/test/decoder_test.exs
index 9baf81f..0011cbd 100644
--- a/test/decoder_test.exs
+++ b/test/decoder_test.exs
@@ -366,5 +366,4 @@ defmodule Xav.DecoderTest do
       assert byte_size(frame) == 640 * 480 * 3
     end
   end
-  end
 end
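Below is a short usage sketch (not part of the diff) of how the reworked `out_format` handling is expected to look from Elixir, based on the README and `lib/decoder.ex` changes above. The `vp8_payload` and `opus_payload` binaries are placeholders, and the bare `Xav.Decoder.new(:opus)` call assumes the options default to `[]`; everything else is taken from what this diff shows.

```elixir
# Video: the requested pixel format is now resolved once in the NIF's new
# function (via av_get_pix_fmt), so an unknown atom fails up front with
# :unknown_out_format instead of failing later inside decoder_init.
decoder = Xav.Decoder.new(:vp8, out_format: :rgb24)
{:ok, %Xav.Frame{} = frame} = Xav.Decoder.decode(decoder, vp8_payload)
tensor = Xav.Frame.to_nx(frame)

# Audio: the NIF now returns the raw FFmpeg sample-format atom (e.g. :flt or
# :dbl) and normalize_format/1 in lib/decoder.ex and lib/reader.ex maps it to
# the Nx-style name (:f32 or :f64) before the frame struct is built.
audio_decoder = Xav.Decoder.new(:opus)

case Xav.Decoder.decode(audio_decoder, opus_payload) do
  {:ok, %Xav.Frame{} = audio_frame} -> Xav.Frame.to_nx(audio_frame)
  # the audio converter may buffer and return no samples for this packet
  :ok -> :no_frame_yet
end
```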